postgresql/src/backend/access/heap/heapam.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

10229 lines
315 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* heapam.c
* heap access method code
*
* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/access/heap/heapam.c
*
*
* INTERFACE ROUTINES
* heap_beginscan - begin relation scan
* heap_rescan - restart a relation scan
* heap_endscan - end relation scan
* heap_getnext - retrieve next tuple in scan
* heap_fetch - retrieve tuple with given tid
* heap_insert - insert tuple into a relation
* heap_multi_insert - insert multiple tuples into a relation
* heap_delete - delete a tuple from a relation
* heap_update - replace a tuple in a relation with another tuple
*
* NOTES
* This file contains the heap_ routines which implement
* the POSTGRES heap access method used for all POSTGRES
* relations.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/bufmask.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/heaptoast.h"
#include "access/hio.h"
#include "access/multixact.h"
#include "access/parallel.h"
#include "access/relscan.h"
#include "access/subtrans.h"
#include "access/syncscan.h"
#include "access/sysattr.h"
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
#include "access/tableam.h"
#include "access/transam.h"
1999-07-16 07:00:38 +02:00
#include "access/valid.h"
#include "access/visibilitymap.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "commands/vacuum.h"
1999-07-16 07:00:38 +02:00
#include "miscadmin.h"
#include "pgstat.h"
#include "port/atomics.h"
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
#include "storage/predicate.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "storage/spin.h"
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
#include "storage/standby.h"
#include "utils/datum.h"
1999-07-16 07:00:38 +02:00
#include "utils/inval.h"
#include "utils/lsyscache.h"
1999-07-16 07:00:38 +02:00
#include "utils/relcache.h"
#include "utils/snapmgr.h"
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#include "utils/spccache.h"
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
TransactionId xid, CommandId cid, int options);
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Buffer newbuf, HeapTuple oldtup,
HeapTuple newtup, HeapTuple old_key_tuple,
bool all_visible_cleared, bool new_all_visible_cleared);
static Bitmapset *HeapDetermineColumnsInfo(Relation relation,
Bitmapset *interesting_cols,
Bitmapset *external_cols,
HeapTuple oldtup, HeapTuple newtup,
bool *has_external);
static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
LockTupleMode mode, LockWaitPolicy wait_policy,
bool *have_tuple_lock);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
uint16 old_infomask2, TransactionId add_to_xmax,
LockTupleMode mode, bool is_update,
TransactionId *result_xmax, uint16 *result_infomask,
uint16 *result_infomask2);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ItemPointer ctid, TransactionId xid,
LockTupleMode mode);
static int heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
xl_heap_freeze_plan *plans_out,
OffsetNumber *offsets_out);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
uint16 *new_infomask2);
static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
uint16 t_infomask);
static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
LockTupleMode lockmode, bool *current_is_member);
static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
Relation rel, ItemPointer ctid, XLTW_Oper oper,
int *remaining);
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
uint16 infomask, Relation rel, int *remaining);
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
static void index_delete_sort(TM_IndexDeleteOp *delstate);
static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
bool *copy);
1996-10-21 07:59:49 +02:00
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Each tuple lock mode has a corresponding heavyweight lock, and one or two
* corresponding MultiXactStatuses (one to merely lock tuples, another one to
* update them). This table (and the macros below) helps us determine the
* heavyweight lock mode and MultiXactStatus values to use for any particular
* tuple lock strength.
*
* Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
* instead.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
static const struct
{
LOCKMODE hwlock;
int lockstatus;
int updstatus;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
tupleLockExtraInfo[MaxLockTupleMode + 1] =
{
{ /* LockTupleKeyShare */
AccessShareLock,
MultiXactStatusForKeyShare,
-1 /* KeyShare does not allow updating tuples */
},
{ /* LockTupleShare */
RowShareLock,
MultiXactStatusForShare,
-1 /* Share does not allow updating tuples */
},
{ /* LockTupleNoKeyExclusive */
ExclusiveLock,
MultiXactStatusForNoKeyUpdate,
MultiXactStatusNoKeyUpdate
},
{ /* LockTupleExclusive */
AccessExclusiveLock,
MultiXactStatusForUpdate,
MultiXactStatusUpdate
}
};
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/* Get the LOCKMODE for a given MultiXactStatus */
#define LOCKMODE_from_mxstatus(status) \
(tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
/*
* Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
* This is more readable than having every caller translate it to lock.h's
* LOCKMODE.
*/
#define LockTupleTuplock(rel, tup, mode) \
LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
#define UnlockTupleTuplock(rel, tup, mode) \
UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
#define ConditionalLockTupleTuplock(rel, tup, mode) \
ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#ifdef USE_PREFETCH
/*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* heap_index_delete_tuples and index_delete_prefetch_buffer use this
* structure to coordinate prefetching activity
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
typedef struct
{
BlockNumber cur_hblkno;
int next_item;
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
int ndeltids;
TM_IndexDelete *deltids;
} IndexDeletePrefetchState;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#endif
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
/* heap_index_delete_tuples bottom-up index deletion costing constants */
#define BOTTOMUP_MAX_NBLOCKS 6
#define BOTTOMUP_TOLERANCE_NBLOCKS 3
/*
* heap_index_delete_tuples uses this when determining which heap blocks it
* must visit to help its bottom-up index deletion caller
*/
typedef struct IndexDeleteCounts
{
int16 npromisingtids; /* Number of "promising" TIDs in group */
int16 ntids; /* Number of TIDs in group */
int16 ifirsttid; /* Offset to group's first deltid */
} IndexDeleteCounts;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* This table maps tuple lock strength values for each particular
* MultiXactStatus value.
*/
static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
{
LockTupleKeyShare, /* ForKeyShare */
LockTupleShare, /* ForShare */
LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
LockTupleExclusive, /* ForUpdate */
LockTupleNoKeyExclusive, /* NoKeyUpdate */
LockTupleExclusive /* Update */
};
/* Get the LockTupleMode for a given MultiXactStatus */
#define TUPLOCK_from_mxstatus(status) \
(MultiXactStatusLock[(status)])
/* ----------------------------------------------------------------
* heap support routines
* ----------------------------------------------------------------
*/
/* ----------------
* initscan - scan code common to heap_beginscan and heap_rescan
* ----------------
*/
static void
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
{
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
ParallelBlockTableScanDesc bpscan = NULL;
bool allow_strat;
bool allow_sync;
/*
* Determine the number of blocks we have to scan.
*
* It is sufficient to do this once at scan start, since any tuples added
* while the scan is in progress will be invisible to my snapshot anyway.
* (That is not true when using a non-MVCC snapshot. However, we couldn't
* guarantee to return tuples added after scan start anyway, since they
* might go into pages we already scanned. To guarantee consistent
* results for a non-MVCC snapshot, the caller must hold some higher-level
* lock that ensures the interesting tuple(s) won't change.)
*/
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
if (scan->rs_base.rs_parallel != NULL)
{
bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
scan->rs_nblocks = bpscan->phs_nblocks;
}
else
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
/*
* If the table is large relative to NBuffers, use a bulk-read access
* strategy and enable synchronized scanning (see syncscan.c). Although
* the thresholds for these features could be different, we make them the
* same so that there are only two behaviors to tune rather than four.
* (However, some callers need to be able to disable one or both of these
* behaviors, independently of the size of the table; also there is a GUC
* variable that can disable synchronized scanning.)
*
* Note that table_block_parallelscan_initialize has a very similar test;
* if you change this, consider changing that one, too.
*/
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
scan->rs_nblocks > NBuffers / 4)
{
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
}
else
allow_strat = allow_sync = false;
if (allow_strat)
{
/* During a rescan, keep the previous strategy object. */
if (scan->rs_strategy == NULL)
scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
}
else
{
if (scan->rs_strategy != NULL)
FreeAccessStrategy(scan->rs_strategy);
scan->rs_strategy = NULL;
}
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
if (scan->rs_base.rs_parallel != NULL)
{
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
/* For parallel scan, believe whatever ParallelTableScanDesc says. */
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (scan->rs_base.rs_parallel->phs_syncscan)
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
}
else if (keep_startblock)
{
/*
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
* When rescanning, we want to keep the previous startblock setting,
* so that rewinding a cursor doesn't generate surprising results.
* Reset the active syncscan setting, though.
*/
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (allow_sync && synchronize_seqscans)
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
}
else if (allow_sync && synchronize_seqscans)
{
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
}
else
{
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
scan->rs_startblock = 0;
}
BRIN: Block Range Indexes BRIN is a new index access method intended to accelerate scans of very large tables, without the maintenance overhead of btrees or other traditional indexes. They work by maintaining "summary" data about block ranges. Bitmap index scans work by reading each summary tuple and comparing them with the query quals; all pages in the range are returned in a lossy TID bitmap if the quals are consistent with the values in the summary tuple, otherwise not. Normal index scans are not supported because these indexes do not store TIDs. As new tuples are added into the index, the summary information is updated (if the block range in which the tuple is added is already summarized) or not; in the latter case, a subsequent pass of VACUUM or the brin_summarize_new_values() function will create the summary information. For data types with natural 1-D sort orders, the summary info consists of the maximum and the minimum values of each indexed column within each page range. This type of operator class we call "Minmax", and we supply a bunch of them for most data types with B-tree opclasses. Since the BRIN code is generalized, other approaches are possible for things such as arrays, geometric types, ranges, etc; even for things such as enum types we could do something different than minmax with better results. In this commit I only include minmax. Catalog version bumped due to new builtin catalog entries. There's more that could be done here, but this is a good step forwards. Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera, with contribution by Heikki Linnakangas. Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas. Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo. PS: The research leading to these results has received funding from the European Union's Seventh Framework Programme (FP7/2007-2013) under grant agreement n° 318633.
2014-11-07 20:38:14 +01:00
scan->rs_numblocks = InvalidBlockNumber;
scan->rs_inited = false;
scan->rs_ctup.t_data = NULL;
ItemPointerSetInvalid(&scan->rs_ctup.t_self);
scan->rs_cbuf = InvalidBuffer;
scan->rs_cblock = InvalidBlockNumber;
/* page-at-a-time fields are always invalid when not rs_inited */
/*
* copy the scan key, if appropriate
*/
if (key != NULL && scan->rs_base.rs_nkeys > 0)
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
/*
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
* Currently, we only have a stats counter for sequential heap scans (but
* e.g for bitmap scans the underlying bitmap index scans will be counted,
* and for sample scans we update stats for tuple fetches).
*/
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
pgstat_count_heap_scan(scan->rs_base.rs_rd);
}
/*
* heap_setscanlimits - restrict range of a heapscan
*
* startBlk is the page to start at
* numBlks is number of pages to scan (InvalidBlockNumber means "all")
*/
BRIN: Block Range Indexes BRIN is a new index access method intended to accelerate scans of very large tables, without the maintenance overhead of btrees or other traditional indexes. They work by maintaining "summary" data about block ranges. Bitmap index scans work by reading each summary tuple and comparing them with the query quals; all pages in the range are returned in a lossy TID bitmap if the quals are consistent with the values in the summary tuple, otherwise not. Normal index scans are not supported because these indexes do not store TIDs. As new tuples are added into the index, the summary information is updated (if the block range in which the tuple is added is already summarized) or not; in the latter case, a subsequent pass of VACUUM or the brin_summarize_new_values() function will create the summary information. For data types with natural 1-D sort orders, the summary info consists of the maximum and the minimum values of each indexed column within each page range. This type of operator class we call "Minmax", and we supply a bunch of them for most data types with B-tree opclasses. Since the BRIN code is generalized, other approaches are possible for things such as arrays, geometric types, ranges, etc; even for things such as enum types we could do something different than minmax with better results. In this commit I only include minmax. Catalog version bumped due to new builtin catalog entries. There's more that could be done here, but this is a good step forwards. Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera, with contribution by Heikki Linnakangas. Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas. Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo. PS: The research leading to these results has received funding from the European Union's Seventh Framework Programme (FP7/2007-2013) under grant agreement n° 318633.
2014-11-07 20:38:14 +01:00
void
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
BRIN: Block Range Indexes BRIN is a new index access method intended to accelerate scans of very large tables, without the maintenance overhead of btrees or other traditional indexes. They work by maintaining "summary" data about block ranges. Bitmap index scans work by reading each summary tuple and comparing them with the query quals; all pages in the range are returned in a lossy TID bitmap if the quals are consistent with the values in the summary tuple, otherwise not. Normal index scans are not supported because these indexes do not store TIDs. As new tuples are added into the index, the summary information is updated (if the block range in which the tuple is added is already summarized) or not; in the latter case, a subsequent pass of VACUUM or the brin_summarize_new_values() function will create the summary information. For data types with natural 1-D sort orders, the summary info consists of the maximum and the minimum values of each indexed column within each page range. This type of operator class we call "Minmax", and we supply a bunch of them for most data types with B-tree opclasses. Since the BRIN code is generalized, other approaches are possible for things such as arrays, geometric types, ranges, etc; even for things such as enum types we could do something different than minmax with better results. In this commit I only include minmax. Catalog version bumped due to new builtin catalog entries. There's more that could be done here, but this is a good step forwards. Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera, with contribution by Heikki Linnakangas. Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas. Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo. PS: The research leading to these results has received funding from the European Union's Seventh Framework Programme (FP7/2007-2013) under grant agreement n° 318633.
2014-11-07 20:38:14 +01:00
{
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
HeapScanDesc scan = (HeapScanDesc) sscan;
Assert(!scan->rs_inited); /* else too late to change */
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
/* else rs_startblock is significant */
Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
/* Check startBlk is valid (but allow case of zero blocks...) */
Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
BRIN: Block Range Indexes BRIN is a new index access method intended to accelerate scans of very large tables, without the maintenance overhead of btrees or other traditional indexes. They work by maintaining "summary" data about block ranges. Bitmap index scans work by reading each summary tuple and comparing them with the query quals; all pages in the range are returned in a lossy TID bitmap if the quals are consistent with the values in the summary tuple, otherwise not. Normal index scans are not supported because these indexes do not store TIDs. As new tuples are added into the index, the summary information is updated (if the block range in which the tuple is added is already summarized) or not; in the latter case, a subsequent pass of VACUUM or the brin_summarize_new_values() function will create the summary information. For data types with natural 1-D sort orders, the summary info consists of the maximum and the minimum values of each indexed column within each page range. This type of operator class we call "Minmax", and we supply a bunch of them for most data types with B-tree opclasses. Since the BRIN code is generalized, other approaches are possible for things such as arrays, geometric types, ranges, etc; even for things such as enum types we could do something different than minmax with better results. In this commit I only include minmax. Catalog version bumped due to new builtin catalog entries. There's more that could be done here, but this is a good step forwards. Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera, with contribution by Heikki Linnakangas. Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas. Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo. PS: The research leading to these results has received funding from the European Union's Seventh Framework Programme (FP7/2007-2013) under grant agreement n° 318633.
2014-11-07 20:38:14 +01:00
scan->rs_startblock = startBlk;
scan->rs_numblocks = numBlks;
}
/*
* heapgetpage - subroutine for heapgettup()
*
* This routine reads and pins the specified page of the relation.
* In page-at-a-time mode it performs additional work, namely determining
* which tuples on the page are visible.
*/
void
heapgetpage(TableScanDesc sscan, BlockNumber block)
{
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
HeapScanDesc scan = (HeapScanDesc) sscan;
Buffer buffer;
Snapshot snapshot;
Page page;
1998-11-27 20:52:36 +01:00
int lines;
int ntup;
1998-11-27 20:52:36 +01:00
OffsetNumber lineoff;
bool all_visible;
Assert(block < scan->rs_nblocks);
/* release previous scan buffer, if any */
if (BufferIsValid(scan->rs_cbuf))
{
ReleaseBuffer(scan->rs_cbuf);
scan->rs_cbuf = InvalidBuffer;
}
/*
* Be sure to check for interrupts at least once per page. Checks at
* higher code levels won't be able to stop a seqscan that encounters many
* pages' worth of consecutive dead tuples.
*/
CHECK_FOR_INTERRUPTS();
/* read page using selected strategy */
scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, block,
RBM_NORMAL, scan->rs_strategy);
scan->rs_cblock = block;
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
return;
buffer = scan->rs_cbuf;
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
snapshot = scan->rs_base.rs_snapshot;
/*
* Prune and repair fragmentation for the whole page, if possible.
*/
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
/*
* We must hold share lock on the buffer content while examining tuple
* visibility. Afterwards, however, the tuples we have found to be
* visible are guaranteed good as long as we hold the buffer pin.
*/
LockBuffer(buffer, BUFFER_LOCK_SHARE);
page = BufferGetPage(buffer);
TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, page);
lines = PageGetMaxOffsetNumber(page);
ntup = 0;
/*
* If the all-visible flag indicates that all tuples on the page are
* visible to everyone, we can skip the per-tuple visibility tests.
*
* Note: In hot standby, a tuple that's already visible to all
* transactions on the primary might still be invisible to a read-only
* transaction in the standby. We partly handle this problem by tracking
* the minimum xmin of visible tuples as the cut-off XID while marking a
* page all-visible on the primary and WAL log that along with the
* visibility map SET operation. In hot standby, we wait for (or abort)
* all transactions that can potentially may not see one or more tuples on
* the page. That's how index-only scans work fine in hot standby. A
* crucial difference between index-only scans and heap scans is that the
* index-only scan completely relies on the visibility map where as heap
* scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
* the page-level flag can be trusted in the same way, because it might
* get propagated somehow without being explicitly WAL-logged, e.g. via a
* full page write. Until we can prove that beyond doubt, let's check each
* tuple for visibility the hard way.
*/
all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++)
{
ItemId lpp = PageGetItemId(page, lineoff);
HeapTupleData loctup;
bool valid;
if (!ItemIdIsNormal(lpp))
continue;
loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp);
loctup.t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(loctup.t_self), block, lineoff);
if (all_visible)
valid = true;
else
valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
&loctup, buffer, snapshot);
if (valid)
scan->rs_vistuples[ntup++] = lineoff;
}
2001-03-22 05:01:46 +01:00
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
Assert(ntup <= MaxHeapTuplesPerPage);
scan->rs_ntuples = ntup;
}
/*
* heapgettup_initial_block - return the first BlockNumber to scan
*
* Returns InvalidBlockNumber when there are no blocks to scan. This can
* occur with empty tables and in parallel scans when parallel workers get all
* of the pages before we can get a chance to get our first page.
*/
static BlockNumber
heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
{
Assert(!scan->rs_inited);
/* When there are no pages to scan, return InvalidBlockNumber */
if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
return InvalidBlockNumber;
if (ScanDirectionIsForward(dir))
{
/* serial scan */
if (scan->rs_base.rs_parallel == NULL)
return scan->rs_startblock;
else
{
/* parallel scan */
table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
scan->rs_parallelworkerdata,
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
/* may return InvalidBlockNumber if there are no more blocks */
return table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
scan->rs_parallelworkerdata,
(ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
}
}
else
{
/* backward parallel scan not supported */
Assert(scan->rs_base.rs_parallel == NULL);
/*
* Disable reporting to syncscan logic in a backwards scan; it's not
* very likely anyone else is doing the same thing at the same time,
* and much more likely that we'll just bollix things for forward
* scanners.
*/
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
/*
* Start from last page of the scan. Ensure we take into account
* rs_numblocks if it's been adjusted by heap_setscanlimits().
*/
if (scan->rs_numblocks != InvalidBlockNumber)
return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
if (scan->rs_startblock > 0)
return scan->rs_startblock - 1;
return scan->rs_nblocks - 1;
}
}
/*
* heapgettup_start_page - helper function for heapgettup()
*
* Return the next page to scan based on the scan->rs_cbuf and set *linesleft
* to the number of tuples on this page. Also set *lineoff to the first
* offset to scan with forward scans getting the first offset and backward
* getting the final offset on the page.
*/
static Page
heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
OffsetNumber *lineoff)
{
Page page;
Assert(scan->rs_inited);
Assert(BufferIsValid(scan->rs_cbuf));
/* Caller is responsible for ensuring buffer is locked if needed */
page = BufferGetPage(scan->rs_cbuf);
TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
*linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1;
if (ScanDirectionIsForward(dir))
*lineoff = FirstOffsetNumber;
else
*lineoff = (OffsetNumber) (*linesleft);
/* lineoff now references the physically previous or next tid */
return page;
}
/*
* heapgettup_continue_page - helper function for heapgettup()
*
* Return the next page to scan based on the scan->rs_cbuf and set *linesleft
* to the number of tuples left to scan on this page. Also set *lineoff to
* the next offset to scan according to the ScanDirection in 'dir'.
*/
static inline Page
heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
OffsetNumber *lineoff)
{
Page page;
Assert(scan->rs_inited);
Assert(BufferIsValid(scan->rs_cbuf));
/* Caller is responsible for ensuring buffer is locked if needed */
page = BufferGetPage(scan->rs_cbuf);
TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
if (ScanDirectionIsForward(dir))
{
*lineoff = OffsetNumberNext(scan->rs_coffset);
*linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
}
else
{
/*
* The previous returned tuple may have been vacuumed since the
* previous scan when we use a non-MVCC snapshot, so we must
* re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
*/
*lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset));
*linesleft = *lineoff;
}
/* lineoff now references the physically previous or next tid */
return page;
}
/*
* heapgettup_advance_block - helper for heapgettup() and heapgettup_pagemode()
*
* Given the current block number, the scan direction, and various information
* contained in the scan descriptor, calculate the BlockNumber to scan next
* and return it. If there are no further blocks to scan, return
* InvalidBlockNumber to indicate this fact to the caller.
*
* This should not be called to determine the initial block number -- only for
* subsequent blocks.
*
* This also adjusts rs_numblocks when a limit has been imposed by
* heap_setscanlimits().
*/
static inline BlockNumber
heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
{
if (ScanDirectionIsForward(dir))
{
if (scan->rs_base.rs_parallel == NULL)
{
block++;
/* wrap back to the start of the heap */
if (block >= scan->rs_nblocks)
block = 0;
/* we're done if we're back at where we started */
if (block == scan->rs_startblock)
return InvalidBlockNumber;
/* check if the limit imposed by heap_setscanlimits() is met */
if (scan->rs_numblocks != InvalidBlockNumber)
{
if (--scan->rs_numblocks == 0)
return InvalidBlockNumber;
}
/*
* Report our new scan position for synchronization purposes. We
* don't do that when moving backwards, however. That would just
* mess up any other forward-moving scanners.
*
* Note: we do this before checking for end of scan so that the
* final state of the position hint is back at the start of the
* rel. That's not strictly necessary, but otherwise when you run
* the same query multiple times the starting position would shift
* a little bit backwards on every invocation, which is confusing.
* We don't guarantee any specific ordering in general, though.
*/
if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
ss_report_location(scan->rs_base.rs_rd, block);
return block;
}
else
{
return table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
scan->rs_parallelworkerdata, (ParallelBlockTableScanDesc)
scan->rs_base.rs_parallel);
}
}
else
{
/* we're done if the last block is the start position */
if (block == scan->rs_startblock)
return InvalidBlockNumber;
/* check if the limit imposed by heap_setscanlimits() is met */
if (scan->rs_numblocks != InvalidBlockNumber)
{
if (--scan->rs_numblocks == 0)
return InvalidBlockNumber;
}
/* wrap to the end of the heap when the last page was page 0 */
if (block == 0)
block = scan->rs_nblocks;
block--;
return block;
}
}
/* ----------------
* heapgettup - fetch next heap tuple
*
* Initialize the scan if not already done; then advance to the next
* tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
* or set scan->rs_ctup.t_data = NULL if no more tuples.
*
* Note: the reason nkeys/key are passed separately, even though they are
* kept in the scan descriptor, is that the caller may not want us to check
* the scankeys.
*
* Note: when we fall off the end of the scan in either direction, we
* reset rs_inited. This means that a further request with the same
* scan direction will restart the scan, which is a bit odd, but a
* request with the opposite scan direction will start a fresh scan
* in the proper direction. The latter is required behavior for cursors,
* while the former case is generally undefined behavior in Postgres
* so we don't care too much.
* ----------------
*/
static void
heapgettup(HeapScanDesc scan,
ScanDirection dir,
int nkeys,
ScanKey key)
{
HeapTuple tuple = &(scan->rs_ctup);
BlockNumber block;
Page page;
OffsetNumber lineoff;
int linesleft;
if (unlikely(!scan->rs_inited))
{
block = heapgettup_initial_block(scan, dir);
/* ensure rs_cbuf is invalid when we get InvalidBlockNumber */
Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf));
scan->rs_inited = true;
}
else
{
/* continue from previously returned page/tuple */
block = scan->rs_cblock;
2007-11-15 22:14:46 +01:00
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
goto continue_page;
}
/*
* advance the scan until we find a qualifying tuple or run out of stuff
* to scan
*/
while (block != InvalidBlockNumber)
{
heapgetpage((TableScanDesc) scan, block);
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
continue_page:
/*
* Only continue scanning the page while we have lines left.
*
* Note that this protects us from accessing line pointers past
* PageGetMaxOffsetNumber(); both for forward scans when we resume the
* table scan, and for when we start scanning a new page.
*/
for (; linesleft > 0; linesleft--, lineoff += dir)
{
bool visible;
ItemId lpp = PageGetItemId(page, lineoff);
if (!ItemIdIsNormal(lpp))
continue;
tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
tuple->t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(tuple->t_self), block, lineoff);
visible = HeapTupleSatisfiesVisibility(tuple,
scan->rs_base.rs_snapshot,
scan->rs_cbuf);
HeapCheckForSerializableConflictOut(visible, scan->rs_base.rs_rd,
tuple, scan->rs_cbuf,
scan->rs_base.rs_snapshot);
/* skip tuples not visible to this snapshot */
if (!visible)
continue;
/* skip any tuples that don't match the scan key */
if (key != NULL &&
!HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
nkeys, key))
continue;
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
scan->rs_coffset = lineoff;
return;
}
/*
* if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
/* get the BlockNumber to scan next */
block = heapgettup_advance_block(scan, block, dir);
}
/* end of scan */
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
scan->rs_cbuf = InvalidBuffer;
scan->rs_cblock = InvalidBlockNumber;
tuple->t_data = NULL;
scan->rs_inited = false;
}
/* ----------------
* heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
*
* Same API as heapgettup, but used in page-at-a-time mode
*
* The internal logic is much the same as heapgettup's too, but there are some
* differences: we do not take the buffer content lock (that only needs to
* happen inside heapgetpage), and we iterate through just the tuples listed
* in rs_vistuples[] rather than all tuples on the page. Notice that
* lineindex is 0-based, where the corresponding loop variable lineoff in
* heapgettup is 1-based.
* ----------------
*/
static void
heapgettup_pagemode(HeapScanDesc scan,
ScanDirection dir,
int nkeys,
ScanKey key)
{
HeapTuple tuple = &(scan->rs_ctup);
BlockNumber block;
Page page;
int lineindex;
int linesleft;
if (unlikely(!scan->rs_inited))
{
block = heapgettup_initial_block(scan, dir);
/* ensure rs_cbuf is invalid when we get InvalidBlockNumber */
Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf));
scan->rs_inited = true;
}
else
{
/* continue from previously returned page/tuple */
block = scan->rs_cblock; /* current page */
page = BufferGetPage(scan->rs_cbuf);
TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
2007-11-15 22:14:46 +01:00
lineindex = scan->rs_cindex + dir;
if (ScanDirectionIsForward(dir))
linesleft = scan->rs_ntuples - lineindex;
else
linesleft = scan->rs_cindex;
/* lineindex now references the next or previous visible tid */
goto continue_page;
}
/*
* advance the scan until we find a qualifying tuple or run out of stuff
* to scan
*/
while (block != InvalidBlockNumber)
{
heapgetpage((TableScanDesc) scan, block);
page = BufferGetPage(scan->rs_cbuf);
TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
linesleft = scan->rs_ntuples;
lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1;
/* lineindex now references the next or previous visible tid */
continue_page:
for (; linesleft > 0; linesleft--, lineindex += dir)
{
ItemId lpp;
OffsetNumber lineoff;
lineoff = scan->rs_vistuples[lineindex];
lpp = PageGetItemId(page, lineoff);
Assert(ItemIdIsNormal(lpp));
tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
tuple->t_len = ItemIdGetLength(lpp);
ItemPointerSet(&(tuple->t_self), block, lineoff);
/* skip any tuples that don't match the scan key */
if (key != NULL &&
!HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
nkeys, key))
continue;
scan->rs_cindex = lineindex;
return;
}
/* get the BlockNumber to scan next */
block = heapgettup_advance_block(scan, block, dir);
}
/* end of scan */
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
scan->rs_cbuf = InvalidBuffer;
scan->rs_cblock = InvalidBlockNumber;
tuple->t_data = NULL;
scan->rs_inited = false;
}
/* ----------------------------------------------------------------
* heap access method interface
* ----------------------------------------------------------------
*/
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
TableScanDesc
heap_beginscan(Relation relation, Snapshot snapshot,
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
int nkeys, ScanKey key,
ParallelTableScanDesc parallel_scan,
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
uint32 flags)
{
HeapScanDesc scan;
/*
* increment relation ref count while scanning relation
*
* This is just to make really sure the relcache entry won't go away while
* the scan has a pointer to it. Caller should be holding the rel open
* anyway, so this is redundant in all normal scenarios...
*/
RelationIncrementReferenceCount(relation);
/*
* allocate and initialize scan descriptor
*/
scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
scan->rs_base.rs_rd = relation;
scan->rs_base.rs_snapshot = snapshot;
scan->rs_base.rs_nkeys = nkeys;
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
scan->rs_base.rs_flags = flags;
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
scan->rs_base.rs_parallel = parallel_scan;
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
scan->rs_strategy = NULL; /* set in initscan */
/*
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
* Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
*/
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (!(snapshot && IsMVCCSnapshot(snapshot)))
scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
/*
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
* For seqscan and sample scans in a serializable transaction, acquire a
* predicate lock on the entire relation. This is required not only to
* lock all the matching tuples, but also to conflict with new insertions
* into the table. In an indexscan, we take page locks on the index pages
* covering the range specified in the scan qual, but in a heap scan there
* is nothing more fine-grained to lock. A bitmap scan is a different
* story, there we have already scanned the index and locked the index
* pages covering the predicate. But in that case we still have to lock
* any matching heap tuples. For sample scan we could optimize the locking
* to be at least page-level granularity, but we'd need to add per-tuple
* locking for that.
*/
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
{
/*
* Ensure a missing snapshot is noticed reliably, even if the
* isolation mode means predicate locking isn't performed (and
* therefore the snapshot isn't used here).
*/
Assert(snapshot);
PredicateLockRelation(relation, snapshot);
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
}
/* we only need to set this up once */
scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
/*
* Allocate memory to keep track of page allocation for parallel workers
* when doing a parallel scan.
*/
if (parallel_scan != NULL)
scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData));
else
scan->rs_parallelworkerdata = NULL;
/*
* we do this here instead of in initscan() because heap_rescan also calls
* initscan() and we don't want to allocate memory again
*/
if (nkeys > 0)
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
else
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
scan->rs_base.rs_key = NULL;
initscan(scan, key, false);
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
return (TableScanDesc) scan;
}
void
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
bool allow_strat, bool allow_sync, bool allow_pagemode)
{
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
HeapScanDesc scan = (HeapScanDesc) sscan;
if (set_params)
{
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (allow_strat)
scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
if (allow_sync)
scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
if (allow_pagemode && scan->rs_base.rs_snapshot &&
IsMVCCSnapshot(scan->rs_base.rs_snapshot))
scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
else
scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
}
/*
* unpin scan buffers
*/
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
/*
* reinitialize scan descriptor
*/
initscan(scan, key, true);
}
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
void
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
heap_endscan(TableScanDesc sscan)
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
{
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
HeapScanDesc scan = (HeapScanDesc) sscan;
Redesign tablesample method API, and do extensive code review. The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
2015-07-25 20:39:00 +02:00
/* Note: no locking manipulations needed */
/*
* unpin scan buffers
*/
if (BufferIsValid(scan->rs_cbuf))
ReleaseBuffer(scan->rs_cbuf);
/*
* decrement relation reference count and free scan descriptor storage
*/
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
RelationDecrementReferenceCount(scan->rs_base.rs_rd);
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
if (scan->rs_base.rs_key)
pfree(scan->rs_base.rs_key);
if (scan->rs_strategy != NULL)
FreeAccessStrategy(scan->rs_strategy);
if (scan->rs_parallelworkerdata != NULL)
pfree(scan->rs_parallelworkerdata);
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
UnregisterSnapshot(scan->rs_base.rs_snapshot);
pfree(scan);
}
HeapTuple
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
heap_getnext(TableScanDesc sscan, ScanDirection direction)
{
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
HeapScanDesc scan = (HeapScanDesc) sscan;
/*
* This is still widely used directly, without going through table AM, so
* add a safety check. It's possible we should, at a later point,
* downgrade this to an assert. The reason for checking the AM routine,
* rather than the AM oid, is that this allows to write regression tests
* that create another AM reusing the heap handler.
*/
if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg_internal("only heap AM is supported")));
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
Implement streaming mode in ReorderBuffer. Instead of serializing the transaction to disk after reaching the logical_decoding_work_mem limit in memory, we consume the changes we have in memory and invoke stream API methods added by commit 45fdc9738b. However, sometimes if we have incomplete toast or speculative insert we spill to the disk because we can't generate the complete tuple and stream. And, as soon as we get the complete tuple we stream the transaction including the serialized changes. We can do this incremental processing thanks to having assignments (associating subxact with toplevel xacts) in WAL right away, and thanks to logging the invalidation messages at each command end. These features are added by commits 0bead9af48 and c55040ccd0 respectively. Now that we can stream in-progress transactions, the concurrent aborts may cause failures when the output plugin consults catalogs (both system and user-defined). We handle such failures by returning ERRCODE_TRANSACTION_ROLLBACK sqlerrcode from system table scan APIs to the backend or WALSender decoding a specific uncommitted transaction. The decoding logic on the receipt of such a sqlerrcode aborts the decoding of the current transaction and continue with the decoding of other transactions. We have ReorderBufferTXN pointer in each ReorderBufferChange by which we know which xact it belongs to. The output plugin can use this to decide which changes to discard in case of stream_abort_cb (e.g. when a subxact gets discarded). We also provide a new option via SQL APIs to fetch the changes being streamed. Author: Dilip Kumar, Tomas Vondra, Amit Kapila, Nikhil Sontakke Reviewed-by: Amit Kapila, Kuntal Ghosh, Ajin Cherian Tested-by: Neha Sharma, Mahendra Singh Thalor and Ajin Cherian Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com
2020-08-08 04:04:39 +02:00
/*
* We don't expect direct calls to heap_getnext with valid CheckXidAlive
* for catalog or regular tables. See detailed comments in xact.c where
* these variables are declared. Normally we have such a check at tableam
* level API but this is called from many places so we need to ensure it
* here.
*/
if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
elog(ERROR, "unexpected heap_getnext call during logical decoding");
/* Note: no locking manipulations needed */
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
heapgettup_pagemode(scan, direction,
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
else
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
heapgettup(scan, direction,
scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
if (scan->rs_ctup.t_data == NULL)
return NULL;
/*
* if we get here it means we have a new current scan tuple, so point to
* the proper return buffer and return the tuple.
*/
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
pgstat_count_heap_getnext(scan->rs_base.rs_rd);
return &scan->rs_ctup;
}
bool
heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
HeapScanDesc scan = (HeapScanDesc) sscan;
/* Note: no locking manipulations needed */
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
else
Don't to predicate lock for analyze scans, refactor scan option passing. Before this commit, when ANALYZE was run on a table and serializable was used (either by virtue of an explicit BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE, or default_transaction_isolation being set to serializable) a null pointer dereference lead to a crash. The analyze scan doesn't need a snapshot (nor predicate locking), but before this commit a scan only contained information about being a bitmap or sample scan. Refactor the option passing to the scan_begin callback to use a bitmask instead. Alternatively we could have added a new boolean parameter, but that seems harder to read. Even before this issue various people (Heikki, Tom, Robert) suggested doing so. These changes don't change the scan APIs outside of tableam. The flags argument could be exposed, it's not necessary to fix this problem. Also the wrapper table_beginscan* functions encapsulate most of that complexity. After these changes fixing the bug is trivial, just don't acquire predicate lock for analyze style scans. That was already done for bitmap heap scans. Add an assert that a snapshot is passed when acquiring the predicate lock, so this kind of bug doesn't require running with serializable. Also add a comment about sample scans currently requiring predicate locking the entire relation, that previously wasn't remarked upon. Reported-By: Joe Wildish Author: Andres Freund Discussion: https://postgr.es/m/4EA80A20-E9BF-49F1-9F01-5B66CAB21453@elusive.cx https://postgr.es/m/20190411164947.nkii4gaeilt4bui7@alap3.anarazel.de https://postgr.es/m/20190518203102.g7peu2fianukjuxm@alap3.anarazel.de
2019-05-20 00:10:28 +02:00
heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
if (scan->rs_ctup.t_data == NULL)
{
ExecClearTuple(slot);
return false;
}
/*
* if we get here it means we have a new current scan tuple, so point to
* the proper return buffer and return the tuple.
*/
pgstat_count_heap_getnext(scan->rs_base.rs_rd);
tableam: Add and use scan APIs. Too allow table accesses to be not directly dependent on heap, several new abstractions are needed. Specifically: 1) Heap scans need to be generalized into table scans. Do this by introducing TableScanDesc, which will be the "base class" for individual AMs. This contains the AM independent fields from HeapScanDesc. The previous heap_{beginscan,rescan,endscan} et al. have been replaced with a table_ version. There's no direct replacement for heap_getnext(), as that returned a HeapTuple, which is undesirable for a other AMs. Instead there's table_scan_getnextslot(). But note that heap_getnext() lives on, it's still used widely to access catalog tables. This is achieved by new scan_begin, scan_end, scan_rescan, scan_getnextslot callbacks. 2) The portion of parallel scans that's shared between backends need to be able to do so without the user doing per-AM work. To achieve that new parallelscan_{estimate, initialize, reinitialize} callbacks are introduced, which operate on a new ParallelTableScanDesc, which again can be subclassed by AMs. As it is likely that several AMs are going to be block oriented, block oriented callbacks that can be shared between such AMs are provided and used by heap. table_block_parallelscan_{estimate, intiialize, reinitialize} as callbacks, and table_block_parallelscan_{nextpage, init} for use in AMs. These operate on a ParallelBlockTableScanDesc. 3) Index scans need to be able to access tables to return a tuple, and there needs to be state across individual accesses to the heap to store state like buffers. That's now handled by introducing a sort-of-scan IndexFetchTable, which again is intended to be subclassed by individual AMs (for heap IndexFetchHeap). The relevant callbacks for an AM are index_fetch_{end, begin, reset} to create the necessary state, and index_fetch_tuple to retrieve an indexed tuple. Note that index_fetch_tuple implementations need to be smarter than just blindly fetching the tuples for AMs that have optimizations similar to heap's HOT - the currently alive tuple in the update chain needs to be fetched if appropriate. Similar to table_scan_getnextslot(), it's undesirable to continue to return HeapTuples. Thus index_fetch_heap (might want to rename that later) now accepts a slot as an argument. Core code doesn't have a lot of call sites performing index scans without going through the systable_* API (in contrast to loads of heap_getnext calls and working directly with HeapTuples). Index scans now store the result of a search in IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the target is not generally a HeapTuple anymore that seems cleaner. To be able to sensible adapt code to use the above, two further callbacks have been introduced: a) slot_callbacks returns a TupleTableSlotOps* suitable for creating slots capable of holding a tuple of the AMs type. table_slot_callbacks() and table_slot_create() are based upon that, but have additional logic to deal with views, foreign tables, etc. While this change could have been done separately, nearly all the call sites that needed to be adapted for the rest of this commit also would have been needed to be adapted for table_slot_callbacks(), making separation not worthwhile. b) tuple_satisfies_snapshot checks whether the tuple in a slot is currently visible according to a snapshot. That's required as a few places now don't have a buffer + HeapTuple around, but a slot (which in heap's case internally has that information). Additionally a few infrastructure changes were needed: I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now internally uses a slot to keep track of tuples. While systable_getnext() still returns HeapTuples, and will so for the foreseeable future, the index API (see 1) above) now only deals with slots. The remainder, and largest part, of this commit is then adjusting all scans in postgres to use the new APIs. Author: Andres Freund, Haribabu Kommi, Alvaro Herrera Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
scan->rs_cbuf);
return true;
}
void
heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
ItemPointer maxtid)
{
HeapScanDesc scan = (HeapScanDesc) sscan;
BlockNumber startBlk;
BlockNumber numBlks;
ItemPointerData highestItem;
ItemPointerData lowestItem;
/*
* For relations without any pages, we can simply leave the TID range
* unset. There will be no tuples to scan, therefore no tuples outside
* the given TID range.
*/
if (scan->rs_nblocks == 0)
return;
/*
* Set up some ItemPointers which point to the first and last possible
* tuples in the heap.
*/
ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);
/*
* If the given maximum TID is below the highest possible TID in the
* relation, then restrict the range to that, otherwise we scan to the end
* of the relation.
*/
if (ItemPointerCompare(maxtid, &highestItem) < 0)
ItemPointerCopy(maxtid, &highestItem);
/*
* If the given minimum TID is above the lowest possible TID in the
* relation, then restrict the range to only scan for TIDs above that.
*/
if (ItemPointerCompare(mintid, &lowestItem) > 0)
ItemPointerCopy(mintid, &lowestItem);
/*
* Check for an empty range and protect from would be negative results
* from the numBlks calculation below.
*/
if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
{
/* Set an empty range of blocks to scan */
heap_setscanlimits(sscan, 0, 0);
return;
}
/*
* Calculate the first block and the number of blocks we must scan. We
* could be more aggressive here and perform some more validation to try
* and further narrow the scope of blocks to scan by checking if the
* lowestItem has an offset above MaxOffsetNumber. In this case, we could
* advance startBlk by one. Likewise, if highestItem has an offset of 0
* we could scan one fewer blocks. However, such an optimization does not
* seem worth troubling over, currently.
*/
startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);
numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;
/* Set the start block and number of blocks to scan */
heap_setscanlimits(sscan, startBlk, numBlks);
/* Finally, set the TID range in sscan */
ItemPointerCopy(&lowestItem, &sscan->rs_mintid);
ItemPointerCopy(&highestItem, &sscan->rs_maxtid);
}
bool
heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
TupleTableSlot *slot)
{
HeapScanDesc scan = (HeapScanDesc) sscan;
ItemPointer mintid = &sscan->rs_mintid;
ItemPointer maxtid = &sscan->rs_maxtid;
/* Note: no locking manipulations needed */
for (;;)
{
if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
else
heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
if (scan->rs_ctup.t_data == NULL)
{
ExecClearTuple(slot);
return false;
}
/*
* heap_set_tidrange will have used heap_setscanlimits to limit the
* range of pages we scan to only ones that can contain the TID range
* we're scanning for. Here we must filter out any tuples from these
* pages that are outside of that range.
*/
if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
{
ExecClearTuple(slot);
/*
* When scanning backwards, the TIDs will be in descending order.
* Future tuples in this direction will be lower still, so we can
* just return false to indicate there will be no more tuples.
*/
if (ScanDirectionIsBackward(direction))
return false;
continue;
}
/*
* Likewise for the final page, we must filter out TIDs greater than
* maxtid.
*/
if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
{
ExecClearTuple(slot);
/*
* When scanning forward, the TIDs will be in ascending order.
* Future tuples in this direction will be higher still, so we can
* just return false to indicate there will be no more tuples.
*/
if (ScanDirectionIsForward(direction))
return false;
continue;
}
break;
}
/*
* if we get here it means we have a new current scan tuple, so point to
* the proper return buffer and return the tuple.
*/
pgstat_count_heap_getnext(scan->rs_base.rs_rd);
ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
return true;
}
/*
* heap_fetch - retrieve tuple with given tid
*
* On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
* the tuple, fill in the remaining fields of *tuple, and check the tuple
* against the specified snapshot.
*
* If successful (tuple found and passes snapshot time qual), then *userbuf
* is set to the buffer holding the tuple and true is returned. The caller
* must unpin the buffer when done with the tuple.
*
* If the tuple is not found (ie, item number references a deleted slot),
Prevent access to no-longer-pinned buffer in heapam_tuple_lock(). heap_fetch() used to have a "keep_buf" parameter that told it to return ownership of the buffer pin to the caller after finding that the requested tuple TID exists but is invisible to the specified snapshot. This was thoughtlessly removed in commit 5db6df0c0, which broke heapam_tuple_lock() (formerly EvalPlanQualFetch) because that function needs to do more accesses to the tuple even if it's invisible. The net effect is that we would continue to touch the page for a microsecond or two after releasing pin on the buffer. Usually no harm would result; but if a different session decided to defragment the page concurrently, we could see garbage data and mistakenly conclude that there's no newer tuple version to chain up to. (It's hard to say whether this has happened in the field. The bug was actually found thanks to a later change that allowed valgrind to detect accesses to non-pinned buffers.) The most reasonable way to fix this is to reintroduce keep_buf, although I made it behave slightly differently: buffer ownership is passed back only if there is a valid tuple at the requested TID. In HEAD, we can just add the parameter back to heap_fetch(). To avoid an API break in the back branches, introduce an additional function heap_fetch_extended() in those branches. In HEAD there is an additional, less obvious API change: tuple->t_data will be set to NULL in all cases where buffer ownership is not returned, in particular when the tuple exists but fails the time qual (and !keep_buf). This is to defend against any other callers attempting to access non-pinned buffers. We concluded that making that change in back branches would be more likely to introduce problems than cure any. In passing, remove a comment about heap_fetch that was obsoleted by 9a8ee1dc6. Per bug #17462 from Daniil Anisimov. Back-patch to v12 where the bug was introduced. Discussion: https://postgr.es/m/17462-9c98a0f00df9bd36@postgresql.org
2022-04-13 19:35:02 +02:00
* then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
* and false is returned.
*
Prevent access to no-longer-pinned buffer in heapam_tuple_lock(). heap_fetch() used to have a "keep_buf" parameter that told it to return ownership of the buffer pin to the caller after finding that the requested tuple TID exists but is invisible to the specified snapshot. This was thoughtlessly removed in commit 5db6df0c0, which broke heapam_tuple_lock() (formerly EvalPlanQualFetch) because that function needs to do more accesses to the tuple even if it's invisible. The net effect is that we would continue to touch the page for a microsecond or two after releasing pin on the buffer. Usually no harm would result; but if a different session decided to defragment the page concurrently, we could see garbage data and mistakenly conclude that there's no newer tuple version to chain up to. (It's hard to say whether this has happened in the field. The bug was actually found thanks to a later change that allowed valgrind to detect accesses to non-pinned buffers.) The most reasonable way to fix this is to reintroduce keep_buf, although I made it behave slightly differently: buffer ownership is passed back only if there is a valid tuple at the requested TID. In HEAD, we can just add the parameter back to heap_fetch(). To avoid an API break in the back branches, introduce an additional function heap_fetch_extended() in those branches. In HEAD there is an additional, less obvious API change: tuple->t_data will be set to NULL in all cases where buffer ownership is not returned, in particular when the tuple exists but fails the time qual (and !keep_buf). This is to defend against any other callers attempting to access non-pinned buffers. We concluded that making that change in back branches would be more likely to introduce problems than cure any. In passing, remove a comment about heap_fetch that was obsoleted by 9a8ee1dc6. Per bug #17462 from Daniil Anisimov. Back-patch to v12 where the bug was introduced. Discussion: https://postgr.es/m/17462-9c98a0f00df9bd36@postgresql.org
2022-04-13 19:35:02 +02:00
* If the tuple is found but fails the time qual check, then the behavior
* depends on the keep_buf parameter. If keep_buf is false, the results
* are the same as for the tuple-not-found case. If keep_buf is true,
* then tuple->t_data and *userbuf are returned as for the success case,
* and again the caller must unpin the buffer; but false is returned.
*
* heap_fetch does not follow HOT chains: only the exact TID requested will
* be fetched.
*
* It is somewhat inconsistent that we ereport() on invalid block number but
* return false on invalid item number. There are a couple of reasons though.
* One is that the caller can relatively easily check the block number for
* validity, but cannot check the item number without reading the page
* himself. Another is that when we are following a t_ctid link, we can be
* reasonably confident that the page number is valid (since VACUUM shouldn't
* truncate off the destination page without having killed the referencing
* tuple first), but the item number might well not be good.
*/
bool
heap_fetch(Relation relation,
1998-07-27 21:38:40 +02:00
Snapshot snapshot,
1998-11-27 20:52:36 +01:00
HeapTuple tuple,
Prevent access to no-longer-pinned buffer in heapam_tuple_lock(). heap_fetch() used to have a "keep_buf" parameter that told it to return ownership of the buffer pin to the caller after finding that the requested tuple TID exists but is invisible to the specified snapshot. This was thoughtlessly removed in commit 5db6df0c0, which broke heapam_tuple_lock() (formerly EvalPlanQualFetch) because that function needs to do more accesses to the tuple even if it's invisible. The net effect is that we would continue to touch the page for a microsecond or two after releasing pin on the buffer. Usually no harm would result; but if a different session decided to defragment the page concurrently, we could see garbage data and mistakenly conclude that there's no newer tuple version to chain up to. (It's hard to say whether this has happened in the field. The bug was actually found thanks to a later change that allowed valgrind to detect accesses to non-pinned buffers.) The most reasonable way to fix this is to reintroduce keep_buf, although I made it behave slightly differently: buffer ownership is passed back only if there is a valid tuple at the requested TID. In HEAD, we can just add the parameter back to heap_fetch(). To avoid an API break in the back branches, introduce an additional function heap_fetch_extended() in those branches. In HEAD there is an additional, less obvious API change: tuple->t_data will be set to NULL in all cases where buffer ownership is not returned, in particular when the tuple exists but fails the time qual (and !keep_buf). This is to defend against any other callers attempting to access non-pinned buffers. We concluded that making that change in back branches would be more likely to introduce problems than cure any. In passing, remove a comment about heap_fetch that was obsoleted by 9a8ee1dc6. Per bug #17462 from Daniil Anisimov. Back-patch to v12 where the bug was introduced. Discussion: https://postgr.es/m/17462-9c98a0f00df9bd36@postgresql.org
2022-04-13 19:35:02 +02:00
Buffer *userbuf,
bool keep_buf)
{
ItemPointer tid = &(tuple->t_self);
1998-11-27 20:52:36 +01:00
ItemId lp;
Buffer buffer;
Page page;
1998-11-27 20:52:36 +01:00
OffsetNumber offnum;
bool valid;
/*
* Fetch and pin the appropriate page of the relation.
*/
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
/*
* Need share lock on buffer to examine tuple commit status.
*/
LockBuffer(buffer, BUFFER_LOCK_SHARE);
page = BufferGetPage(buffer);
TestForOldSnapshot(snapshot, relation, page);
/*
* We'd better check for out-of-range offnum in case of VACUUM since the
* TID was obtained.
*/
offnum = ItemPointerGetOffsetNumber(tid);
if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
ReleaseBuffer(buffer);
*userbuf = InvalidBuffer;
tuple->t_data = NULL;
return false;
}
/*
* get the item line pointer corresponding to the requested tid
*/
lp = PageGetItemId(page, offnum);
/*
* Must check for deleted tuple.
*/
if (!ItemIdIsNormal(lp))
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
ReleaseBuffer(buffer);
*userbuf = InvalidBuffer;
tuple->t_data = NULL;
return false;
}
/*
* fill in *tuple fields
*/
tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1998-11-27 20:52:36 +01:00
tuple->t_len = ItemIdGetLength(lp);
tuple->t_tableOid = RelationGetRelid(relation);
1998-11-27 20:52:36 +01:00
/*
* check tuple visibility, then release lock
*/
valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
if (valid)
PredicateLockTID(relation, &(tuple->t_self), snapshot,
HeapTupleHeaderGetXmin(tuple->t_data));
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
if (valid)
{
/*
* All checks passed, so return the tuple as valid. Caller is now
* responsible for releasing the buffer.
*/
*userbuf = buffer;
return true;
}
Prevent access to no-longer-pinned buffer in heapam_tuple_lock(). heap_fetch() used to have a "keep_buf" parameter that told it to return ownership of the buffer pin to the caller after finding that the requested tuple TID exists but is invisible to the specified snapshot. This was thoughtlessly removed in commit 5db6df0c0, which broke heapam_tuple_lock() (formerly EvalPlanQualFetch) because that function needs to do more accesses to the tuple even if it's invisible. The net effect is that we would continue to touch the page for a microsecond or two after releasing pin on the buffer. Usually no harm would result; but if a different session decided to defragment the page concurrently, we could see garbage data and mistakenly conclude that there's no newer tuple version to chain up to. (It's hard to say whether this has happened in the field. The bug was actually found thanks to a later change that allowed valgrind to detect accesses to non-pinned buffers.) The most reasonable way to fix this is to reintroduce keep_buf, although I made it behave slightly differently: buffer ownership is passed back only if there is a valid tuple at the requested TID. In HEAD, we can just add the parameter back to heap_fetch(). To avoid an API break in the back branches, introduce an additional function heap_fetch_extended() in those branches. In HEAD there is an additional, less obvious API change: tuple->t_data will be set to NULL in all cases where buffer ownership is not returned, in particular when the tuple exists but fails the time qual (and !keep_buf). This is to defend against any other callers attempting to access non-pinned buffers. We concluded that making that change in back branches would be more likely to introduce problems than cure any. In passing, remove a comment about heap_fetch that was obsoleted by 9a8ee1dc6. Per bug #17462 from Daniil Anisimov. Back-patch to v12 where the bug was introduced. Discussion: https://postgr.es/m/17462-9c98a0f00df9bd36@postgresql.org
2022-04-13 19:35:02 +02:00
/* Tuple failed time qual, but maybe caller wants to see it anyway. */
if (keep_buf)
*userbuf = buffer;
else
{
ReleaseBuffer(buffer);
*userbuf = InvalidBuffer;
tuple->t_data = NULL;
}
return false;
}
/*
* heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
*
* On entry, *tid is the TID of a tuple (either a simple tuple, or the root
* of a HOT chain), and buffer is the buffer holding this tuple. We search
* for the first chain member satisfying the given snapshot. If one is
* found, we update *tid to reference that tuple's offset number, and
* return true. If no match, return false without modifying *tid.
*
* heapTuple is a caller-supplied buffer. When a match is found, we return
* the tuple here, in addition to updating *tid. If no match is found, the
* contents of this buffer on return are undefined.
*
* If all_dead is not NULL, we check non-visible tuples to see if they are
* globally dead; *all_dead is set true if all members of the HOT chain
* are vacuumable, false if not.
*
* Unlike heap_fetch, the caller must already have pin and (at least) share
Prevent access to no-longer-pinned buffer in heapam_tuple_lock(). heap_fetch() used to have a "keep_buf" parameter that told it to return ownership of the buffer pin to the caller after finding that the requested tuple TID exists but is invisible to the specified snapshot. This was thoughtlessly removed in commit 5db6df0c0, which broke heapam_tuple_lock() (formerly EvalPlanQualFetch) because that function needs to do more accesses to the tuple even if it's invisible. The net effect is that we would continue to touch the page for a microsecond or two after releasing pin on the buffer. Usually no harm would result; but if a different session decided to defragment the page concurrently, we could see garbage data and mistakenly conclude that there's no newer tuple version to chain up to. (It's hard to say whether this has happened in the field. The bug was actually found thanks to a later change that allowed valgrind to detect accesses to non-pinned buffers.) The most reasonable way to fix this is to reintroduce keep_buf, although I made it behave slightly differently: buffer ownership is passed back only if there is a valid tuple at the requested TID. In HEAD, we can just add the parameter back to heap_fetch(). To avoid an API break in the back branches, introduce an additional function heap_fetch_extended() in those branches. In HEAD there is an additional, less obvious API change: tuple->t_data will be set to NULL in all cases where buffer ownership is not returned, in particular when the tuple exists but fails the time qual (and !keep_buf). This is to defend against any other callers attempting to access non-pinned buffers. We concluded that making that change in back branches would be more likely to introduce problems than cure any. In passing, remove a comment about heap_fetch that was obsoleted by 9a8ee1dc6. Per bug #17462 from Daniil Anisimov. Back-patch to v12 where the bug was introduced. Discussion: https://postgr.es/m/17462-9c98a0f00df9bd36@postgresql.org
2022-04-13 19:35:02 +02:00
* lock on the buffer; it is still pinned/locked at exit.
*/
bool
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
Snapshot snapshot, HeapTuple heapTuple,
bool *all_dead, bool first_call)
{
Page page = BufferGetPage(buffer);
TransactionId prev_xmax = InvalidTransactionId;
BlockNumber blkno;
OffsetNumber offnum;
bool at_chain_start;
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
bool valid;
bool skip;
snapshot scalability: Don't compute global horizons while building snapshots. To make GetSnapshotData() more scalable, it cannot not look at at each proc's xmin: While snapshot contents do not need to change whenever a read-only transaction commits or a snapshot is released, a proc's xmin is modified in those cases. The frequency of xmin modifications leads to, particularly on higher core count systems, many cache misses inside GetSnapshotData(), despite the data underlying a snapshot not changing. That is the most significant source of GetSnapshotData() scaling poorly on larger systems. Without accessing xmins, GetSnapshotData() cannot calculate accurate horizons / thresholds as it has so far. But we don't really have to: The horizons don't actually change that much between GetSnapshotData() calls. Nor are the horizons actually used every time a snapshot is built. The trick this commit introduces is to delay computation of accurate horizons until there use and using horizon boundaries to determine whether accurate horizons need to be computed. The use of RecentGlobal[Data]Xmin to decide whether a row version could be removed has been replaces with new GlobalVisTest* functions. These use two thresholds to determine whether a row can be pruned: 1) definitely_needed, indicating that rows deleted by XIDs >= definitely_needed are definitely still visible. 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can definitely be removed GetSnapshotData() updates definitely_needed to be the xmin of the computed snapshot. When testing whether a row can be removed (with GlobalVisTestIsRemovableXid()) and the tested XID falls in between the two (i.e. XID >= maybe_needed && XID < definitely_needed) the boundaries can be recomputed to be more accurate. As it is not cheap to compute accurate boundaries, we limit the number of times that happens in short succession. As the boundaries used by GlobalVisTestIsRemovableXid() are never reset (with maybe_needed updated by GetSnapshotData()), it is likely that further test can benefit from an earlier computation of accurate horizons. To avoid regressing performance when old_snapshot_threshold is set (as that requires an accurate horizon to be computed), heap_page_prune_opt() doesn't unconditionally call TransactionIdLimitedForOldSnapshots() anymore. Both the computation of the limited horizon, and the triggering of errors (with SetOldSnapshotThresholdTimestamp()) is now only done when necessary to remove tuples. This commit just removes the accesses to PGXACT->xmin from GetSnapshotData(), but other members of PGXACT residing in the same cache line are accessed. Therefore this in itself does not result in a significant improvement. Subsequent commits will take advantage of the fact that GetSnapshotData() now does not need to access xmins anymore. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. Author: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Reviewed-By: Thomas Munro <thomas.munro@gmail.com> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
2020-08-13 01:03:49 +02:00
GlobalVisState *vistest = NULL;
/* If this is not the first call, previous call returned a (live!) tuple */
if (all_dead)
*all_dead = first_call;
blkno = ItemPointerGetBlockNumber(tid);
offnum = ItemPointerGetOffsetNumber(tid);
at_chain_start = first_call;
skip = !first_call;
snapshot scalability: Don't compute global horizons while building snapshots. To make GetSnapshotData() more scalable, it cannot not look at at each proc's xmin: While snapshot contents do not need to change whenever a read-only transaction commits or a snapshot is released, a proc's xmin is modified in those cases. The frequency of xmin modifications leads to, particularly on higher core count systems, many cache misses inside GetSnapshotData(), despite the data underlying a snapshot not changing. That is the most significant source of GetSnapshotData() scaling poorly on larger systems. Without accessing xmins, GetSnapshotData() cannot calculate accurate horizons / thresholds as it has so far. But we don't really have to: The horizons don't actually change that much between GetSnapshotData() calls. Nor are the horizons actually used every time a snapshot is built. The trick this commit introduces is to delay computation of accurate horizons until there use and using horizon boundaries to determine whether accurate horizons need to be computed. The use of RecentGlobal[Data]Xmin to decide whether a row version could be removed has been replaces with new GlobalVisTest* functions. These use two thresholds to determine whether a row can be pruned: 1) definitely_needed, indicating that rows deleted by XIDs >= definitely_needed are definitely still visible. 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can definitely be removed GetSnapshotData() updates definitely_needed to be the xmin of the computed snapshot. When testing whether a row can be removed (with GlobalVisTestIsRemovableXid()) and the tested XID falls in between the two (i.e. XID >= maybe_needed && XID < definitely_needed) the boundaries can be recomputed to be more accurate. As it is not cheap to compute accurate boundaries, we limit the number of times that happens in short succession. As the boundaries used by GlobalVisTestIsRemovableXid() are never reset (with maybe_needed updated by GetSnapshotData()), it is likely that further test can benefit from an earlier computation of accurate horizons. To avoid regressing performance when old_snapshot_threshold is set (as that requires an accurate horizon to be computed), heap_page_prune_opt() doesn't unconditionally call TransactionIdLimitedForOldSnapshots() anymore. Both the computation of the limited horizon, and the triggering of errors (with SetOldSnapshotThresholdTimestamp()) is now only done when necessary to remove tuples. This commit just removes the accesses to PGXACT->xmin from GetSnapshotData(), but other members of PGXACT residing in the same cache line are accessed. Therefore this in itself does not result in a significant improvement. Subsequent commits will take advantage of the fact that GetSnapshotData() now does not need to access xmins anymore. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. Author: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Reviewed-By: Thomas Munro <thomas.munro@gmail.com> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
2020-08-13 01:03:49 +02:00
/* XXX: we should assert that a snapshot is pushed or registered */
Assert(TransactionIdIsValid(RecentXmin));
Assert(BufferGetBlockNumber(buffer) == blkno);
/* Scan through possible multiple members of HOT-chain */
for (;;)
{
ItemId lp;
/* check for bogus TID */
if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
break;
lp = PageGetItemId(page, offnum);
/* check for unused, dead, or redirected items */
if (!ItemIdIsNormal(lp))
{
/* We should only see a redirect at start of chain */
if (ItemIdIsRedirected(lp) && at_chain_start)
{
/* Follow the redirect */
offnum = ItemIdGetRedirect(lp);
at_chain_start = false;
continue;
}
/* else must be end of chain */
break;
}
/*
* Update heapTuple to point to the element of the HOT chain we're
* currently investigating. Having t_self set correctly is important
* because the SSI checks and the *Satisfies routine for historical
* MVCC snapshots need the correct tid to decide about the visibility.
*/
heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
heapTuple->t_len = ItemIdGetLength(lp);
heapTuple->t_tableOid = RelationGetRelid(relation);
ItemPointerSet(&heapTuple->t_self, blkno, offnum);
/*
* Shouldn't see a HEAP_ONLY tuple at chain start.
*/
if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
break;
/*
* The xmin should match the previous xmax value, else chain is
* broken.
*/
if (TransactionIdIsValid(prev_xmax) &&
!TransactionIdEquals(prev_xmax,
HeapTupleHeaderGetXmin(heapTuple->t_data)))
break;
/*
* When first_call is true (and thus, skip is initially false) we'll
* return the first tuple we find. But on later passes, heapTuple
* will initially be pointing to the tuple we returned last time.
* Returning it again would be incorrect (and would loop forever), so
* we skip it and return the next match we find.
*/
if (!skip)
{
/* If it's visible per the snapshot, we must return it */
valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
buffer, snapshot);
if (valid)
{
ItemPointerSetOffsetNumber(tid, offnum);
PredicateLockTID(relation, &heapTuple->t_self, snapshot,
HeapTupleHeaderGetXmin(heapTuple->t_data));
if (all_dead)
*all_dead = false;
return true;
}
}
skip = false;
/*
* If we can't see it, maybe no one else can either. At caller
* request, check whether all chain members are dead to all
* transactions.
Improve performance of get_actual_variable_range with recently-dead tuples. In commit fccebe421, we hacked get_actual_variable_range() to scan the index with SnapshotDirty, so that if there are many uncommitted tuples at the end of the index range, it wouldn't laboriously scan through all of them looking for a live value to return. However, that didn't fix it for the case of many recently-dead tuples at the end of the index; SnapshotDirty recognizes those as committed dead and so we're back to the same problem. To improve the situation, invent a "SnapshotNonVacuumable" snapshot type and use that instead. The reason this helps is that, if the snapshot rejects a given index entry, we know that the indexscan will mark that index entry as killed. This means the next get_actual_variable_range() scan will proceed past that entry without visiting the heap, making the scan a lot faster. We may end up accepting a recently-dead tuple as being the estimated extremal value, but that doesn't seem much worse than the compromise we made before to accept not-yet-committed extremal values. The cost of the scan is still proportional to the number of dead index entries at the end of the range, so in the interval after a mass delete but before VACUUM's cleaned up the mess, it's still possible for get_actual_variable_range() to take a noticeable amount of time, if you've got enough such dead entries. But the constant factor is much much better than before, since all we need to do with each index entry is test its "killed" bit. We chose to back-patch commit fccebe421 at the time, but I'm hesitant to do so here, because this form of the problem seems to affect many fewer people. Also, even when it happens, it's less bad than the case fixed by commit fccebe421 because we don't get the contention effects from expensive TransactionIdIsInProgress tests. Dmitriy Sarafannikov, reviewed by Andrey Borodin Discussion: https://postgr.es/m/05C72CF7-B5F6-4DB9-8A09-5AC897653113@yandex.ru
2017-09-08 01:41:51 +02:00
*
* Note: if you change the criterion here for what is "dead", fix the
* planner's get_actual_variable_range() function to match.
*/
snapshot scalability: Don't compute global horizons while building snapshots. To make GetSnapshotData() more scalable, it cannot not look at at each proc's xmin: While snapshot contents do not need to change whenever a read-only transaction commits or a snapshot is released, a proc's xmin is modified in those cases. The frequency of xmin modifications leads to, particularly on higher core count systems, many cache misses inside GetSnapshotData(), despite the data underlying a snapshot not changing. That is the most significant source of GetSnapshotData() scaling poorly on larger systems. Without accessing xmins, GetSnapshotData() cannot calculate accurate horizons / thresholds as it has so far. But we don't really have to: The horizons don't actually change that much between GetSnapshotData() calls. Nor are the horizons actually used every time a snapshot is built. The trick this commit introduces is to delay computation of accurate horizons until there use and using horizon boundaries to determine whether accurate horizons need to be computed. The use of RecentGlobal[Data]Xmin to decide whether a row version could be removed has been replaces with new GlobalVisTest* functions. These use two thresholds to determine whether a row can be pruned: 1) definitely_needed, indicating that rows deleted by XIDs >= definitely_needed are definitely still visible. 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can definitely be removed GetSnapshotData() updates definitely_needed to be the xmin of the computed snapshot. When testing whether a row can be removed (with GlobalVisTestIsRemovableXid()) and the tested XID falls in between the two (i.e. XID >= maybe_needed && XID < definitely_needed) the boundaries can be recomputed to be more accurate. As it is not cheap to compute accurate boundaries, we limit the number of times that happens in short succession. As the boundaries used by GlobalVisTestIsRemovableXid() are never reset (with maybe_needed updated by GetSnapshotData()), it is likely that further test can benefit from an earlier computation of accurate horizons. To avoid regressing performance when old_snapshot_threshold is set (as that requires an accurate horizon to be computed), heap_page_prune_opt() doesn't unconditionally call TransactionIdLimitedForOldSnapshots() anymore. Both the computation of the limited horizon, and the triggering of errors (with SetOldSnapshotThresholdTimestamp()) is now only done when necessary to remove tuples. This commit just removes the accesses to PGXACT->xmin from GetSnapshotData(), but other members of PGXACT residing in the same cache line are accessed. Therefore this in itself does not result in a significant improvement. Subsequent commits will take advantage of the fact that GetSnapshotData() now does not need to access xmins anymore. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. Author: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Reviewed-By: Thomas Munro <thomas.munro@gmail.com> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de
2020-08-13 01:03:49 +02:00
if (all_dead && *all_dead)
{
if (!vistest)
vistest = GlobalVisTestFor(relation);
if (!HeapTupleIsSurelyDead(heapTuple, vistest))
*all_dead = false;
}
/*
* Check to see if HOT chain continues past this tuple; if so fetch
* the next offnum and loop around.
*/
if (HeapTupleIsHotUpdated(heapTuple))
{
Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
blkno);
offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
at_chain_start = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
}
else
break; /* end of chain */
}
return false;
}
/*
* heap_get_latest_tid - get the latest tid of a specified tuple
*
* Actually, this gets the latest version that is visible according to the
* scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
* possibly uncommitted version.
*
* *tid is both an input and an output parameter: it is updated to
* show the latest version of the row. Note that it will not be changed
* if no version of the row passes the snapshot test.
*/
void
heap_get_latest_tid(TableScanDesc sscan,
ItemPointer tid)
{
Relation relation = sscan->rs_rd;
Snapshot snapshot = sscan->rs_snapshot;
ItemPointerData ctid;
TransactionId priorXmax;
/*
* table_tuple_get_latest_tid() verified that the passed in tid is valid.
* Assume that t_ctid links are valid however - there shouldn't be invalid
* ones in the table.
*/
Assert(ItemPointerIsValid(tid));
/*
* Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
* need to examine, and *tid is the TID we will return if ctid turns out
* to be bogus.
*
* Note that we will loop until we reach the end of the t_ctid chain.
* Depending on the snapshot passed, there might be at most one visible
* version of the row, but we don't try to optimize for that.
*/
ctid = *tid;
priorXmax = InvalidTransactionId; /* cannot check first XMIN */
for (;;)
{
Buffer buffer;
Page page;
OffsetNumber offnum;
ItemId lp;
HeapTupleData tp;
bool valid;
/*
* Read, pin, and lock the page.
*/
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
LockBuffer(buffer, BUFFER_LOCK_SHARE);
page = BufferGetPage(buffer);
TestForOldSnapshot(snapshot, relation, page);
/*
* Check for bogus item number. This is not treated as an error
* condition because it can happen while following a t_ctid link. We
* just assume that the prior tid is OK and return it unchanged.
*/
offnum = ItemPointerGetOffsetNumber(&ctid);
if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
{
UnlockReleaseBuffer(buffer);
break;
}
lp = PageGetItemId(page, offnum);
if (!ItemIdIsNormal(lp))
{
UnlockReleaseBuffer(buffer);
break;
}
/* OK to access the tuple */
tp.t_self = ctid;
tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
tp.t_len = ItemIdGetLength(lp);
tp.t_tableOid = RelationGetRelid(relation);
/*
* After following a t_ctid link, we might arrive at an unrelated
* tuple. Check for XMIN match.
*/
if (TransactionIdIsValid(priorXmax) &&
!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
{
UnlockReleaseBuffer(buffer);
break;
}
/*
* Check tuple visibility; if visible, set it as the new result
* candidate.
*/
valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
if (valid)
*tid = ctid;
/*
* If there's a valid t_ctid link, follow it, else we're done.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2018-04-07 22:24:10 +02:00
HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
{
UnlockReleaseBuffer(buffer);
break;
}
ctid = tp.t_data->t_ctid;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
UnlockReleaseBuffer(buffer);
} /* end of loop */
}
/*
* UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
*
* This is called after we have waited for the XMAX transaction to terminate.
* If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
* be set on exit. If the transaction committed, we set the XMAX_COMMITTED
* hint bit if possible --- but beware that that may not yet be possible,
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* if the transaction committed asynchronously.
*
* Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
* even if it commits.
*
* Hence callers should look only at XMAX_INVALID.
*
* Note this is not allowed for tuples whose xmax is a multixact.
*/
static void
UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
TransactionIdDidCommit(xid))
HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
xid);
else
HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
InvalidTransactionId);
}
}
/*
* GetBulkInsertState - prepare status object for a bulk insert
*/
BulkInsertState
GetBulkInsertState(void)
{
BulkInsertState bistate;
bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
bistate->current_buf = InvalidBuffer;
hio: Use ExtendBufferedRelBy() to extend tables more efficiently While we already had some form of bulk extension for relations, it was fairly limited. It only amortized the cost of acquiring the extension lock, the relation itself was still extended one-by-one. Bulk extension was also solely triggered by contention, not by the amount of data inserted. To address this, use ExtendBufferedRelBy(), introduced in 31966b151e6, to extend the relation. We try to extend the relation by multiple blocks in two situations: 1) The caller tells RelationGetBufferForTuple() that it will need multiple pages. For now that's only used by heap_multi_insert(), see commit FIXME. 2) If there is contention on the extension lock, use the number of waiters for the lock as a multiplier for the number of blocks to extend by. This is similar to what we already did. Previously we additionally multiplied the numbers of waiters by 20, but with the new relation extension infrastructure I could not see a benefit in doing so. Using the freespacemap to provide empty pages can cause significant contention, and adds measurable overhead, even if there is no contention. To reduce that, remember the blocks the relation was extended by in the BulkInsertState, in the extending backend. In case 1) from above, the blocks the extending backend needs are not entered into the FSM, as we know that we will need those blocks. One complication with using the FSM to record empty pages, is that we need to insert blocks into the FSM, when we already hold a buffer content lock. To avoid doing IO while holding a content lock, release the content lock before recording free space. Currently that opens a small window in which another backend could fill the block, if a concurrent VACUUM records the free space. If that happens, we retry, similar to the already existing case when otherBuffer is provided. In the future it might be worth closing the race by preventing VACUUM from recording the space in newly extended pages. This change provides very significant wins (3x at 16 clients, on my workstation) for concurrent COPY into a single relation. Even single threaded COPY is measurably faster, primarily due to not dirtying pages while extending, if supported by the operating system (see commit 4d330a61bb1). Even single-row INSERTs benefit, although to a much smaller degree, as the relation extension lock rarely is the primary bottleneck. Reviewed-by: Melanie Plageman <melanieplageman@gmail.com> Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
2023-04-07 01:35:21 +02:00
bistate->next_free = InvalidBlockNumber;
bistate->last_free = InvalidBlockNumber;
return bistate;
}
/*
* FreeBulkInsertState - clean up after finishing a bulk insert
*/
void
FreeBulkInsertState(BulkInsertState bistate)
{
if (bistate->current_buf != InvalidBuffer)
ReleaseBuffer(bistate->current_buf);
FreeAccessStrategy(bistate->strategy);
pfree(bistate);
}
/*
* ReleaseBulkInsertStatePin - release a buffer currently held in bistate
*/
void
ReleaseBulkInsertStatePin(BulkInsertState bistate)
{
if (bistate->current_buf != InvalidBuffer)
ReleaseBuffer(bistate->current_buf);
bistate->current_buf = InvalidBuffer;
}
/*
* heap_insert - insert tuple into a heap
*
* The new tuple is stamped with current transaction ID and the specified
* command ID.
*
* See table_tuple_insert for comments about most of the input flags, except
* that this routine directly takes a tuple rather than a slot.
Fix logical decoding error when system table w/ toast is repeatedly rewritten. Repeatedly rewriting a mapped catalog table with VACUUM FULL or CLUSTER could cause logical decoding to fail with: ERROR, "could not map filenode \"%s\" to relation OID" To trigger the problem the rewritten catalog had to have live tuples with toasted columns. The problem was triggered as during catalog table rewrites the heap_insert() check that prevents logical decoding information to be emitted for system catalogs, failed to treat the new heap's toast table as a system catalog (because the new heap is not recognized as a catalog table via RelationIsLogicallyLogged()). The relmapper, in contrast to the normal catalog contents, does not contain historical information. After a single rewrite of a mapped table the new relation is known to the relmapper, but if the table is rewritten twice before logical decoding occurs, the relfilenode cannot be mapped to a relation anymore. Which then leads us to error out. This only happens for toast tables, because the main table contents aren't re-inserted with heap_insert(). The fix is simple, add a new heap_insert() flag that prevents logical decoding information from being emitted, and accept during decoding that there might not be tuple data for toast tables. Unfortunately that does not fix pre-existing logical decoding errors. Doing so would require not throwing an error when a filenode cannot be mapped to a relation during decoding, and that seems too likely to hide bugs. If it's crucial to fix decoding for an existing slot, temporarily changing the ERROR in ReorderBufferCommit() to a WARNING appears to be the best fix. Author: Andres Freund Discussion: https://postgr.es/m/20180914021046.oi7dm4ra3ot2g2kt@alap3.anarazel.de Backpatch: 9.4-, where logical decoding was introduced
2018-10-10 22:53:02 +02:00
*
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
* options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
* implement table_tuple_insert_speculative().
*
* On return the header fields of *tup are updated to match the stored tuple;
* in particular tup->t_self receives the actual TID where the tuple was
* stored. But note that any toasting of fields within the tuple data is NOT
* reflected into *tup.
*/
Remove WITH OIDS support, change oid catalog column visibility. Previously tables declared WITH OIDS, including a significant fraction of the catalog tables, stored the oid column not as a normal column, but as part of the tuple header. This special column was not shown by default, which was somewhat odd, as it's often (consider e.g. pg_class.oid) one of the more important parts of a row. Neither pg_dump nor COPY included the contents of the oid column by default. The fact that the oid column was not an ordinary column necessitated a significant amount of special case code to support oid columns. That already was painful for the existing, but upcoming work aiming to make table storage pluggable, would have required expanding and duplicating that "specialness" significantly. WITH OIDS has been deprecated since 2005 (commit ff02d0a05280e0). Remove it. Removing includes: - CREATE TABLE and ALTER TABLE syntax for declaring the table to be WITH OIDS has been removed (WITH (oids[ = true]) will error out) - pg_dump does not support dumping tables declared WITH OIDS and will issue a warning when dumping one (and ignore the oid column). - restoring an pg_dump archive with pg_restore will warn when restoring a table with oid contents (and ignore the oid column) - COPY will refuse to load binary dump that includes oids. - pg_upgrade will error out when encountering tables declared WITH OIDS, they have to be altered to remove the oid column first. - Functionality to access the oid of the last inserted row (like plpgsql's RESULT_OID, spi's SPI_lastoid, ...) has been removed. The syntax for declaring a table WITHOUT OIDS (or WITH (oids = false) for CREATE TABLE) is still supported. While that requires a bit of support code, it seems unnecessary to break applications / dumps that do not use oids, and are explicit about not using them. The biggest user of WITH OID columns was postgres' catalog. This commit changes all 'magic' oid columns to be columns that are normally declared and stored. To reduce unnecessary query breakage all the newly added columns are still named 'oid', even if a table's column naming scheme would indicate 'reloid' or such. This obviously requires adapting a lot code, mostly replacing oid access via HeapTupleGetOid() with access to the underlying Form_pg_*->oid column. The bootstrap process now assigns oids for all oid columns in genbki.pl that do not have an explicit value (starting at the largest oid previously used), only oids assigned later by oids will be above FirstBootstrapObjectId. As the oid column now is a normal column the special bootstrap syntax for oids has been removed. Oids are not automatically assigned during insertion anymore, all backend code explicitly assigns oids with GetNewOidWithIndex(). For the rare case that insertions into the catalog via SQL are called for the new pg_nextoid() function can be used (which only works on catalog tables). The fact that oid columns on system tables are now normal columns means that they will be included in the set of columns expanded by * (i.e. SELECT * FROM pg_class will now include the table's oid, previously it did not). It'd not technically be hard to hide oid column by default, but that'd mean confusing behavior would either have to be carried forward forever, or it'd cause breakage down the line. While it's not unlikely that further adjustments are needed, the scope/invasiveness of the patch makes it worthwhile to get merge this now. It's painful to maintain externally, too complicated to commit after the code code freeze, and a dependency of a number of other patches. Catversion bump, for obvious reasons. Author: Andres Freund, with contributions by John Naylor Discussion: https://postgr.es/m/20180930034810.ywp2c7awz7opzcfr@alap3.anarazel.de
2018-11-21 00:36:57 +01:00
void
heap_insert(Relation relation, HeapTuple tup, CommandId cid,
int options, BulkInsertState bistate)
{
TransactionId xid = GetCurrentTransactionId();
HeapTuple heaptup;
2000-07-03 04:54:21 +02:00
Buffer buffer;
Buffer vmbuffer = InvalidBuffer;
bool all_visible_cleared = false;
2000-07-03 04:54:21 +02:00
Fix mishandling of resjunk columns in ON CONFLICT ... UPDATE tlists. It's unusual to have any resjunk columns in an ON CONFLICT ... UPDATE list, but it can happen when MULTIEXPR_SUBLINK SubPlans are present. If it happens, the ON CONFLICT UPDATE code path would end up storing tuples that include the values of the extra resjunk columns. That's fairly harmless in the short run, but if new columns are added to the table then the values would become accessible, possibly leading to malfunctions if they don't match the datatypes of the new columns. This had escaped notice through a confluence of missing sanity checks, including * There's no cross-check that a tuple presented to heap_insert or heap_update matches the table rowtype. While it's difficult to check that fully at reasonable cost, we can easily add assertions that there aren't too many columns. * The output-column-assignment cases in execExprInterp.c lacked any sanity checks on the output column numbers, which seems like an oversight considering there are plenty of assertion checks on input column numbers. Add assertions there too. * We failed to apply nodeModifyTable's ExecCheckPlanOutput() to the ON CONFLICT UPDATE tlist. That wouldn't have caught this specific error, since that function is chartered to ignore resjunk columns; but it sure seems like a bad omission now that we've seen this bug. In HEAD, the right way to fix this is to make the processing of ON CONFLICT UPDATE tlists work the same as regular UPDATE tlists now do, that is don't add "SET x = x" entries, and use ExecBuildUpdateProjection to evaluate the tlist and combine it with old values of the not-set columns. This adds a little complication to ExecBuildUpdateProjection, but allows removal of a comparable amount of now-dead code from the planner. In the back branches, the most expedient solution seems to be to (a) use an output slot for the ON CONFLICT UPDATE projection that actually matches the target table, and then (b) invent a variant of ExecBuildProjectionInfo that can be told to not store values resulting from resjunk columns, so it doesn't try to store into nonexistent columns of the output slot. (We can't simply ignore the resjunk columns altogether; they have to be evaluated for MULTIEXPR_SUBLINK to work.) This works back to v10. In 9.6, projections work much differently and we can't cheaply give them such an option. The 9.6 version of this patch works by inserting a JunkFilter when it's necessary to get rid of resjunk columns. In addition, v11 and up have the reverse problem when trying to perform ON CONFLICT UPDATE on a partitioned table. Through a further oversight, adjust_partition_tlist() discarded resjunk columns when re-ordering the ON CONFLICT UPDATE tlist to match a partition. This accidentally prevented the storing-bogus-tuples problem, but at the cost that MULTIEXPR_SUBLINK cases didn't work, typically crashing if more than one row has to be updated. Fix by preserving resjunk columns in that routine. (I failed to resist the temptation to add more assertions there too, and to do some minor code beautification.) Per report from Andres Freund. Back-patch to all supported branches. Security: CVE-2021-32028
2021-05-10 17:02:29 +02:00
/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
RelationGetNumberOfAttributes(relation));
/*
* Fill in tuple header fields and toast the tuple if necessary.
*
* Note: below this point, heaptup is the data we actually intend to store
* into the relation; tup is the caller's original untoasted data.
*/
heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
/*
* Find buffer to insert this tuple into. If the page is all visible,
* this will also pin the requisite visibility map page.
*/
buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
InvalidBuffer, options, bistate,
&vmbuffer, NULL,
0);
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
/*
* We're about to do the actual insert -- but check for conflict first, to
* avoid possibly having to roll back work we've just done.
*
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* This is safe without a recheck as long as there is no possibility of
* another process scanning the page between this check and the insert
* being visible to the scan (i.e., an exclusive buffer content lock is
* continuously held from this point until the tuple insert is visible).
*
* For a heap insert, we only need to check for table-level SSI locks. Our
* new tuple can't possibly conflict with existing tuple locks, and heap
* page locks are only consolidated versions of tuple locks; they do not
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* lock "gaps" as index page locks do. So we don't need to specify a
* buffer when making the call, which makes for a faster check.
*/
CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
RelationPutHeapTuple(relation, buffer, heaptup,
(options & HEAP_INSERT_SPECULATIVE) != 0);
if (PageIsAllVisible(BufferGetPage(buffer)))
{
all_visible_cleared = true;
PageClearAllVisible(BufferGetPage(buffer));
visibilitymap_clear(relation,
ItemPointerGetBlockNumber(&(heaptup->t_self)),
vmbuffer, VISIBILITYMAP_VALID_BITS);
}
/*
* XXX Should we set PageSetPrunable on this page ?
*
* The inserting transaction may eventually abort thus making this tuple
* DEAD and hence available for pruning. Though we don't want to optimize
* for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
* aborted tuple will never be pruned until next vacuum is triggered.
*
* If you do add PageSetPrunable here, add it in heap_xlog_insert too.
*/
MarkBufferDirty(buffer);
2000-06-02 12:20:27 +02:00
/* XLOG stuff */
Skip WAL for new relfilenodes, under wal_level=minimal. Until now, only selected bulk operations (e.g. COPY) did this. If a given relfilenode received both a WAL-skipping COPY and a WAL-logged operation (e.g. INSERT), recovery could lose tuples from the COPY. See src/backend/access/transam/README section "Skipping WAL for New RelFileNode" for the new coding rules. Maintainers of table access methods should examine that section. To maintain data durability, just before commit, we choose between an fsync of the relfilenode and copying its contents to WAL. A new GUC, wal_skip_threshold, guides that choice. If this change slows a workload that creates small, permanent relfilenodes under wal_level=minimal, try adjusting wal_skip_threshold. Users setting a timeout on COMMIT may need to adjust that timeout, and log_min_duration_statement analysis will reflect time consumption moving to COMMIT from commands like COPY. Internally, this requires a reliable determination of whether RollbackAndReleaseCurrentSubTransaction() would unlink a relation's current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the specification of rd_createSubid such that the field is zero when a new rel has an old rd_node. Make relcache.c retain entries for certain dropped relations until end of transaction. Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN. Future servers accept older WAL, so this bump is discretionary. Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert Haas. Heikki Linnakangas and Michael Paquier implemented earlier designs that materially clarified the problem. Reviewed, in earlier designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane, Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout. Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
if (RelationNeedsWAL(relation))
2000-06-02 12:20:27 +02:00
{
xl_heap_insert xlrec;
xl_heap_header xlhdr;
XLogRecPtr recptr;
Page page = BufferGetPage(buffer);
uint8 info = XLOG_HEAP_INSERT;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
int bufflags = 0;
/*
* If this is a catalog, we need to transmit combo CIDs to properly
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* decode, so log that as well.
*/
if (RelationIsAccessibleInLogicalDecoding(relation))
log_heap_new_cid(relation, heaptup);
/*
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* If this is the single and first tuple on page, we can reinit the
* page instead of restoring the whole thing. Set flag, and hide
* buffer references from XLogInsert.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
{
info |= XLOG_HEAP_INIT_PAGE;
bufflags |= REGBUF_WILL_INIT;
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags = 0;
if (all_visible_cleared)
xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
if (options & HEAP_INSERT_SPECULATIVE)
xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
/*
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* For logical decoding, we need the tuple even if we're doing a full
* page write, so make sure it's included even if we take a full-page
* image. (XXX We could alternatively store a pointer into the FPW).
*/
Fix logical decoding error when system table w/ toast is repeatedly rewritten. Repeatedly rewriting a mapped catalog table with VACUUM FULL or CLUSTER could cause logical decoding to fail with: ERROR, "could not map filenode \"%s\" to relation OID" To trigger the problem the rewritten catalog had to have live tuples with toasted columns. The problem was triggered as during catalog table rewrites the heap_insert() check that prevents logical decoding information to be emitted for system catalogs, failed to treat the new heap's toast table as a system catalog (because the new heap is not recognized as a catalog table via RelationIsLogicallyLogged()). The relmapper, in contrast to the normal catalog contents, does not contain historical information. After a single rewrite of a mapped table the new relation is known to the relmapper, but if the table is rewritten twice before logical decoding occurs, the relfilenode cannot be mapped to a relation anymore. Which then leads us to error out. This only happens for toast tables, because the main table contents aren't re-inserted with heap_insert(). The fix is simple, add a new heap_insert() flag that prevents logical decoding information from being emitted, and accept during decoding that there might not be tuple data for toast tables. Unfortunately that does not fix pre-existing logical decoding errors. Doing so would require not throwing an error when a filenode cannot be mapped to a relation during decoding, and that seems too likely to hide bugs. If it's crucial to fix decoding for an existing slot, temporarily changing the ERROR in ReorderBufferCommit() to a WARNING appears to be the best fix. Author: Andres Freund Discussion: https://postgr.es/m/20180914021046.oi7dm4ra3ot2g2kt@alap3.anarazel.de Backpatch: 9.4-, where logical decoding was introduced
2018-10-10 22:53:02 +02:00
if (RelationIsLogicallyLogged(relation) &&
!(options & HEAP_INSERT_NO_LOGICAL))
{
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
bufflags |= REGBUF_KEEP_DATA;
Implement streaming mode in ReorderBuffer. Instead of serializing the transaction to disk after reaching the logical_decoding_work_mem limit in memory, we consume the changes we have in memory and invoke stream API methods added by commit 45fdc9738b. However, sometimes if we have incomplete toast or speculative insert we spill to the disk because we can't generate the complete tuple and stream. And, as soon as we get the complete tuple we stream the transaction including the serialized changes. We can do this incremental processing thanks to having assignments (associating subxact with toplevel xacts) in WAL right away, and thanks to logging the invalidation messages at each command end. These features are added by commits 0bead9af48 and c55040ccd0 respectively. Now that we can stream in-progress transactions, the concurrent aborts may cause failures when the output plugin consults catalogs (both system and user-defined). We handle such failures by returning ERRCODE_TRANSACTION_ROLLBACK sqlerrcode from system table scan APIs to the backend or WALSender decoding a specific uncommitted transaction. The decoding logic on the receipt of such a sqlerrcode aborts the decoding of the current transaction and continue with the decoding of other transactions. We have ReorderBufferTXN pointer in each ReorderBufferChange by which we know which xact it belongs to. The output plugin can use this to decide which changes to discard in case of stream_abort_cb (e.g. when a subxact gets discarded). We also provide a new option via SQL APIs to fetch the changes being streamed. Author: Dilip Kumar, Tomas Vondra, Amit Kapila, Nikhil Sontakke Reviewed-by: Amit Kapila, Kuntal Ghosh, Ajin Cherian Tested-by: Neha Sharma, Mahendra Singh Thalor and Ajin Cherian Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com
2020-08-08 04:04:39 +02:00
if (IsToastRelation(relation))
xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
xlhdr.t_infomask = heaptup->t_data->t_infomask;
xlhdr.t_hoff = heaptup->t_data->t_hoff;
/*
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* note we mark xlhdr as belonging to buffer; if XLogInsert decides to
* write the whole page to the xlog, we don't need to store
* xl_heap_header in the xlog.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
XLogRegisterBufData(0,
(char *) heaptup->t_data + SizeofHeapTupleHeader,
heaptup->t_len - SizeofHeapTupleHeader);
2000-06-02 12:20:27 +02:00
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/* filtering by origin on a row level is much more efficient */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP_ID, info);
PageSetLSN(page, recptr);
2000-06-02 12:20:27 +02:00
}
END_CRIT_SECTION();
2000-06-02 12:20:27 +02:00
UnlockReleaseBuffer(buffer);
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
/*
* If tuple is cachable, mark it for invalidation from the caches in case
* we abort. Note it is OK to do this after releasing the buffer, because
* the heaptup data structure is all in local memory, not in the shared
* buffer.
*/
CacheInvalidateHeapTuple(relation, heaptup, NULL);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
/* Note: speculative insertions are counted too, even if aborted later */
pgstat_count_heap_insert(relation, 1);
/*
* If heaptup is a private copy, release it. Don't forget to copy t_self
* back to the caller's image, too.
*/
if (heaptup != tup)
{
tup->t_self = heaptup->t_self;
heap_freetuple(heaptup);
}
}
/*
* Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
* tuple header fields and toasts the tuple if necessary. Returns a toasted
* version of the tuple if it was toasted, or the original tuple if not. Note
* that in any case, the header fields are also set in the original tuple.
*/
static HeapTuple
heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
CommandId cid, int options)
{
/*
* To allow parallel inserts, we need to ensure that they are safe to be
* performed in workers. We have the infrastructure to allow parallel
* inserts in general except for the cases where inserts generate a new
* CommandId (eg. inserts into a table having a foreign key column).
*/
if (IsParallelWorker())
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
errmsg("cannot insert tuples in a parallel worker")));
tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
HeapTupleHeaderSetXmin(tup->t_data, xid);
if (options & HEAP_INSERT_FROZEN)
HeapTupleHeaderSetXminFrozen(tup->t_data);
HeapTupleHeaderSetCmin(tup->t_data, cid);
HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
tup->t_tableOid = RelationGetRelid(relation);
/*
* If the new tuple is too big for storage or contains already toasted
* out-of-line attributes from some other relation, invoke the toaster.
*/
if (relation->rd_rel->relkind != RELKIND_RELATION &&
relation->rd_rel->relkind != RELKIND_MATVIEW)
{
/* toast table entries should never be recursively toasted */
Assert(!HeapTupleHasExternal(tup));
return tup;
}
else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
return heap_toast_insert_or_update(relation, tup, NULL, options);
else
return tup;
}
/*
* Helper for heap_multi_insert() that computes the number of entire pages
* that inserting the remaining heaptuples requires. Used to determine how
* much the relation needs to be extended by.
*/
static int
heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
{
size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
int npages = 1;
for (int i = done; i < ntuples; i++)
{
size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
if (page_avail < tup_sz)
{
npages++;
page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
}
page_avail -= tup_sz;
}
return npages;
}
/*
* heap_multi_insert - insert multiple tuples into a heap
*
* This is like heap_insert(), but inserts multiple tuples in one operation.
* That's faster than calling heap_insert() in a loop, because when multiple
* tuples can be inserted on a single page, we can write just a single WAL
* record covering all of them, and only need to lock/unlock the page once.
*
* Note: this leaks memory into the current memory context. You can create a
* temporary context before calling this, if that's a problem.
*/
void
heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
CommandId cid, int options, BulkInsertState bistate)
{
TransactionId xid = GetCurrentTransactionId();
HeapTuple *heaptuples;
int i;
int ndone;
PGAlignedBlock scratch;
Page page;
Buffer vmbuffer = InvalidBuffer;
bool needwal;
Size saveFreeSpace;
bool need_tuple_data = RelationIsLogicallyLogged(relation);
bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
bool starting_with_empty_page = false;
int npages = 0;
int npages_used = 0;
Fix logical decoding error when system table w/ toast is repeatedly rewritten. Repeatedly rewriting a mapped catalog table with VACUUM FULL or CLUSTER could cause logical decoding to fail with: ERROR, "could not map filenode \"%s\" to relation OID" To trigger the problem the rewritten catalog had to have live tuples with toasted columns. The problem was triggered as during catalog table rewrites the heap_insert() check that prevents logical decoding information to be emitted for system catalogs, failed to treat the new heap's toast table as a system catalog (because the new heap is not recognized as a catalog table via RelationIsLogicallyLogged()). The relmapper, in contrast to the normal catalog contents, does not contain historical information. After a single rewrite of a mapped table the new relation is known to the relmapper, but if the table is rewritten twice before logical decoding occurs, the relfilenode cannot be mapped to a relation anymore. Which then leads us to error out. This only happens for toast tables, because the main table contents aren't re-inserted with heap_insert(). The fix is simple, add a new heap_insert() flag that prevents logical decoding information from being emitted, and accept during decoding that there might not be tuple data for toast tables. Unfortunately that does not fix pre-existing logical decoding errors. Doing so would require not throwing an error when a filenode cannot be mapped to a relation during decoding, and that seems too likely to hide bugs. If it's crucial to fix decoding for an existing slot, temporarily changing the ERROR in ReorderBufferCommit() to a WARNING appears to be the best fix. Author: Andres Freund Discussion: https://postgr.es/m/20180914021046.oi7dm4ra3ot2g2kt@alap3.anarazel.de Backpatch: 9.4-, where logical decoding was introduced
2018-10-10 22:53:02 +02:00
/* currently not needed (thus unsupported) for heap_multi_insert() */
Assert(!(options & HEAP_INSERT_NO_LOGICAL));
Fix logical decoding error when system table w/ toast is repeatedly rewritten. Repeatedly rewriting a mapped catalog table with VACUUM FULL or CLUSTER could cause logical decoding to fail with: ERROR, "could not map filenode \"%s\" to relation OID" To trigger the problem the rewritten catalog had to have live tuples with toasted columns. The problem was triggered as during catalog table rewrites the heap_insert() check that prevents logical decoding information to be emitted for system catalogs, failed to treat the new heap's toast table as a system catalog (because the new heap is not recognized as a catalog table via RelationIsLogicallyLogged()). The relmapper, in contrast to the normal catalog contents, does not contain historical information. After a single rewrite of a mapped table the new relation is known to the relmapper, but if the table is rewritten twice before logical decoding occurs, the relfilenode cannot be mapped to a relation anymore. Which then leads us to error out. This only happens for toast tables, because the main table contents aren't re-inserted with heap_insert(). The fix is simple, add a new heap_insert() flag that prevents logical decoding information from being emitted, and accept during decoding that there might not be tuple data for toast tables. Unfortunately that does not fix pre-existing logical decoding errors. Doing so would require not throwing an error when a filenode cannot be mapped to a relation during decoding, and that seems too likely to hide bugs. If it's crucial to fix decoding for an existing slot, temporarily changing the ERROR in ReorderBufferCommit() to a WARNING appears to be the best fix. Author: Andres Freund Discussion: https://postgr.es/m/20180914021046.oi7dm4ra3ot2g2kt@alap3.anarazel.de Backpatch: 9.4-, where logical decoding was introduced
2018-10-10 22:53:02 +02:00
Skip WAL for new relfilenodes, under wal_level=minimal. Until now, only selected bulk operations (e.g. COPY) did this. If a given relfilenode received both a WAL-skipping COPY and a WAL-logged operation (e.g. INSERT), recovery could lose tuples from the COPY. See src/backend/access/transam/README section "Skipping WAL for New RelFileNode" for the new coding rules. Maintainers of table access methods should examine that section. To maintain data durability, just before commit, we choose between an fsync of the relfilenode and copying its contents to WAL. A new GUC, wal_skip_threshold, guides that choice. If this change slows a workload that creates small, permanent relfilenodes under wal_level=minimal, try adjusting wal_skip_threshold. Users setting a timeout on COMMIT may need to adjust that timeout, and log_min_duration_statement analysis will reflect time consumption moving to COMMIT from commands like COPY. Internally, this requires a reliable determination of whether RollbackAndReleaseCurrentSubTransaction() would unlink a relation's current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the specification of rd_createSubid such that the field is zero when a new rel has an old rd_node. Make relcache.c retain entries for certain dropped relations until end of transaction. Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN. Future servers accept older WAL, so this bump is discretionary. Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert Haas. Heikki Linnakangas and Michael Paquier implemented earlier designs that materially clarified the problem. Reviewed, in earlier designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane, Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout. Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
needwal = RelationNeedsWAL(relation);
saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
/* Toast and set header data in all the slots */
heaptuples = palloc(ntuples * sizeof(HeapTuple));
for (i = 0; i < ntuples; i++)
{
HeapTuple tuple;
tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
slots[i]->tts_tableOid = RelationGetRelid(relation);
tuple->t_tableOid = slots[i]->tts_tableOid;
heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
options);
}
/*
* We're about to do the actual inserts -- but check for conflict first,
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* to minimize the possibility of having to roll back work we've just
* done.
*
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* A check here does not definitively prevent a serialization anomaly;
* that check MUST be done at least past the point of acquiring an
* exclusive buffer content lock on every buffer that will be affected,
* and MAY be done after all inserts are reflected in the buffers and
* those locks are released; otherwise there is a race condition. Since
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* multiple buffers can be locked and unlocked in the loop below, and it
* would not be feasible to identify and lock all of those buffers before
* the loop, we must do a final check at the end.
*
* The check here could be omitted with no loss of correctness; it is
* present strictly as an optimization.
*
* For heap inserts, we only need to check for table-level SSI locks. Our
* new tuples can't possibly conflict with existing tuple locks, and heap
* page locks are only consolidated versions of tuple locks; they do not
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* lock "gaps" as index page locks do. So we don't need to specify a
* buffer when making the call, which makes for a faster check.
*/
CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
ndone = 0;
while (ndone < ntuples)
{
Buffer buffer;
bool all_visible_cleared = false;
bool all_frozen_set = false;
int nthispage;
CHECK_FOR_INTERRUPTS();
/*
* Compute number of pages needed to fit the to-be-inserted tuples in
* the worst case. This will be used to determine how much to extend
* the relation by in RelationGetBufferForTuple(), if needed. If we
* filled a prior page from scratch, we can just update our last
* computation, but if we started with a partially filled page,
* recompute from scratch, the number of potentially required pages
* can vary due to tuples needing to fit onto the page, page headers
* etc.
*/
if (ndone == 0 || !starting_with_empty_page)
{
npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
saveFreeSpace);
npages_used = 0;
}
else
npages_used++;
/*
* Find buffer where at least the next tuple will fit. If the page is
* all-visible, this will also pin the requisite visibility map page.
*
* Also pin visibility map page if COPY FREEZE inserts tuples into an
* empty page. See all_frozen_set below.
*/
buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
InvalidBuffer, options, bistate,
&vmbuffer, NULL,
npages - npages_used);
page = BufferGetPage(buffer);
starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
all_frozen_set = true;
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
/*
* RelationGetBufferForTuple has ensured that the first tuple fits.
* Put that on the page, and then as many other tuples as fit.
*/
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
/*
* For logical decoding we need combo CIDs to properly decode the
* catalog.
*/
if (needwal && need_cids)
log_heap_new_cid(relation, heaptuples[ndone]);
for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
{
HeapTuple heaptup = heaptuples[ndone + nthispage];
if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
break;
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
RelationPutHeapTuple(relation, buffer, heaptup, false);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/*
* For logical decoding we need combo CIDs to properly decode the
* catalog.
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
*/
if (needwal && need_cids)
log_heap_new_cid(relation, heaptup);
}
/*
* If the page is all visible, need to clear that, unless we're only
* going to add further frozen rows to it.
*
* If we're only adding already frozen rows to a previously empty
* page, mark it as all-visible.
*/
if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
{
all_visible_cleared = true;
PageClearAllVisible(page);
visibilitymap_clear(relation,
BufferGetBlockNumber(buffer),
vmbuffer, VISIBILITYMAP_VALID_BITS);
}
else if (all_frozen_set)
PageSetAllVisible(page);
/*
* XXX Should we set PageSetPrunable on this page ? See heap_insert()
*/
MarkBufferDirty(buffer);
/* XLOG stuff */
if (needwal)
{
XLogRecPtr recptr;
xl_heap_multi_insert *xlrec;
uint8 info = XLOG_HEAP2_MULTI_INSERT;
char *tupledata;
int totaldatalen;
char *scratchptr = scratch.data;
bool init;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
int bufflags = 0;
/*
* If the page was previously empty, we can reinit the page
* instead of restoring the whole thing.
*/
init = starting_with_empty_page;
/* allocate xl_heap_multi_insert struct from the scratch area */
xlrec = (xl_heap_multi_insert *) scratchptr;
scratchptr += SizeOfHeapMultiInsert;
/*
* Allocate offsets array. Unless we're reinitializing the page,
* in that case the tuples are stored in order starting at
* FirstOffsetNumber and we don't need to store the offsets
* explicitly.
*/
if (!init)
scratchptr += nthispage * sizeof(OffsetNumber);
/* the rest of the scratch space is used for tuple data */
tupledata = scratchptr;
/* check that the mutually exclusive flags are not both set */
Assert(!(all_visible_cleared && all_frozen_set));
xlrec->flags = 0;
if (all_visible_cleared)
xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED;
if (all_frozen_set)
xlrec->flags = XLH_INSERT_ALL_FROZEN_SET;
xlrec->ntuples = nthispage;
/*
* Write out an xl_multi_insert_tuple and the tuple data itself
* for each tuple.
*/
for (i = 0; i < nthispage; i++)
{
HeapTuple heaptup = heaptuples[ndone + i];
xl_multi_insert_tuple *tuphdr;
int datalen;
if (!init)
xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
/* xl_multi_insert_tuple needs two-byte alignment. */
tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
tuphdr->t_infomask = heaptup->t_data->t_infomask;
tuphdr->t_hoff = heaptup->t_data->t_hoff;
/* write bitmap [+ padding] [+ oid] + data */
datalen = heaptup->t_len - SizeofHeapTupleHeader;
memcpy(scratchptr,
(char *) heaptup->t_data + SizeofHeapTupleHeader,
datalen);
tuphdr->datalen = datalen;
scratchptr += datalen;
}
totaldatalen = scratchptr - tupledata;
Assert((scratchptr - scratch.data) < BLCKSZ);
if (need_tuple_data)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
/*
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* Signal that this is the last xl_heap_multi_insert record
* emitted by this call to heap_multi_insert(). Needed for logical
* decoding so it knows when to cleanup temporary data.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (ndone + nthispage == ntuples)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (init)
{
info |= XLOG_HEAP_INIT_PAGE;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
bufflags |= REGBUF_WILL_INIT;
}
/*
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* If we're doing logical decoding, include the new tuple data
* even if we take a full-page image of the page.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (need_tuple_data)
bufflags |= REGBUF_KEEP_DATA;
XLogBeginInsert();
XLogRegisterData((char *) xlrec, tupledata - scratch.data);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0, tupledata, totaldatalen);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/* filtering by origin on a row level is much more efficient */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP2_ID, info);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
/*
* If we've frozen everything on the page, update the visibilitymap.
* We're already holding pin on the vmbuffer.
*/
if (all_frozen_set)
{
Assert(PageIsAllVisible(page));
Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
/*
* It's fine to use InvalidTransactionId here - this is only used
* when HEAP_INSERT_FROZEN is specified, which intentionally
* violates visibility rules.
*/
visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
InvalidXLogRecPtr, vmbuffer,
InvalidTransactionId,
VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
}
UnlockReleaseBuffer(buffer);
ndone += nthispage;
/*
* NB: Only release vmbuffer after inserting all tuples - it's fairly
* likely that we'll insert into subsequent heap pages that are likely
* to use the same vm page.
*/
}
/* We're done with inserting all tuples, so release the last vmbuffer. */
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
/*
* We're done with the actual inserts. Check for conflicts again, to
* ensure that all rw-conflicts in to these inserts are detected. Without
* this final check, a sequential scan of the heap may have locked the
* table after the "before" check, missing one opportunity to detect the
* conflict, and then scanned the table before the new tuples were there,
* missing the other chance to detect the conflict.
*
* For heap inserts, we only need to check for table-level SSI locks. Our
* new tuples can't possibly conflict with existing tuple locks, and heap
* page locks are only consolidated versions of tuple locks; they do not
* lock "gaps" as index page locks do. So we don't need to specify a
* buffer when making the call.
*/
CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
/*
* If tuples are cachable, mark them for invalidation from the caches in
* case we abort. Note it is OK to do this after releasing the buffer,
* because the heaptuples data structure is all in local memory, not in
* the shared buffer.
*/
if (IsCatalogRelation(relation))
{
for (i = 0; i < ntuples; i++)
CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
}
/* copy t_self fields back to the caller's slots */
for (i = 0; i < ntuples; i++)
slots[i]->tts_tid = heaptuples[i]->t_self;
pgstat_count_heap_insert(relation, ntuples);
}
/*
* simple_heap_insert - insert a tuple
*
* Currently, this routine differs from heap_insert only in supplying
* a default command ID and not allowing access to the speedup options.
*
* This should be used rather than using heap_insert directly in most places
* where we are modifying system catalogs.
*/
Remove WITH OIDS support, change oid catalog column visibility. Previously tables declared WITH OIDS, including a significant fraction of the catalog tables, stored the oid column not as a normal column, but as part of the tuple header. This special column was not shown by default, which was somewhat odd, as it's often (consider e.g. pg_class.oid) one of the more important parts of a row. Neither pg_dump nor COPY included the contents of the oid column by default. The fact that the oid column was not an ordinary column necessitated a significant amount of special case code to support oid columns. That already was painful for the existing, but upcoming work aiming to make table storage pluggable, would have required expanding and duplicating that "specialness" significantly. WITH OIDS has been deprecated since 2005 (commit ff02d0a05280e0). Remove it. Removing includes: - CREATE TABLE and ALTER TABLE syntax for declaring the table to be WITH OIDS has been removed (WITH (oids[ = true]) will error out) - pg_dump does not support dumping tables declared WITH OIDS and will issue a warning when dumping one (and ignore the oid column). - restoring an pg_dump archive with pg_restore will warn when restoring a table with oid contents (and ignore the oid column) - COPY will refuse to load binary dump that includes oids. - pg_upgrade will error out when encountering tables declared WITH OIDS, they have to be altered to remove the oid column first. - Functionality to access the oid of the last inserted row (like plpgsql's RESULT_OID, spi's SPI_lastoid, ...) has been removed. The syntax for declaring a table WITHOUT OIDS (or WITH (oids = false) for CREATE TABLE) is still supported. While that requires a bit of support code, it seems unnecessary to break applications / dumps that do not use oids, and are explicit about not using them. The biggest user of WITH OID columns was postgres' catalog. This commit changes all 'magic' oid columns to be columns that are normally declared and stored. To reduce unnecessary query breakage all the newly added columns are still named 'oid', even if a table's column naming scheme would indicate 'reloid' or such. This obviously requires adapting a lot code, mostly replacing oid access via HeapTupleGetOid() with access to the underlying Form_pg_*->oid column. The bootstrap process now assigns oids for all oid columns in genbki.pl that do not have an explicit value (starting at the largest oid previously used), only oids assigned later by oids will be above FirstBootstrapObjectId. As the oid column now is a normal column the special bootstrap syntax for oids has been removed. Oids are not automatically assigned during insertion anymore, all backend code explicitly assigns oids with GetNewOidWithIndex(). For the rare case that insertions into the catalog via SQL are called for the new pg_nextoid() function can be used (which only works on catalog tables). The fact that oid columns on system tables are now normal columns means that they will be included in the set of columns expanded by * (i.e. SELECT * FROM pg_class will now include the table's oid, previously it did not). It'd not technically be hard to hide oid column by default, but that'd mean confusing behavior would either have to be carried forward forever, or it'd cause breakage down the line. While it's not unlikely that further adjustments are needed, the scope/invasiveness of the patch makes it worthwhile to get merge this now. It's painful to maintain externally, too complicated to commit after the code code freeze, and a dependency of a number of other patches. Catversion bump, for obvious reasons. Author: Andres Freund, with contributions by John Naylor Discussion: https://postgr.es/m/20180930034810.ywp2c7awz7opzcfr@alap3.anarazel.de
2018-11-21 00:36:57 +01:00
void
simple_heap_insert(Relation relation, HeapTuple tup)
{
Remove WITH OIDS support, change oid catalog column visibility. Previously tables declared WITH OIDS, including a significant fraction of the catalog tables, stored the oid column not as a normal column, but as part of the tuple header. This special column was not shown by default, which was somewhat odd, as it's often (consider e.g. pg_class.oid) one of the more important parts of a row. Neither pg_dump nor COPY included the contents of the oid column by default. The fact that the oid column was not an ordinary column necessitated a significant amount of special case code to support oid columns. That already was painful for the existing, but upcoming work aiming to make table storage pluggable, would have required expanding and duplicating that "specialness" significantly. WITH OIDS has been deprecated since 2005 (commit ff02d0a05280e0). Remove it. Removing includes: - CREATE TABLE and ALTER TABLE syntax for declaring the table to be WITH OIDS has been removed (WITH (oids[ = true]) will error out) - pg_dump does not support dumping tables declared WITH OIDS and will issue a warning when dumping one (and ignore the oid column). - restoring an pg_dump archive with pg_restore will warn when restoring a table with oid contents (and ignore the oid column) - COPY will refuse to load binary dump that includes oids. - pg_upgrade will error out when encountering tables declared WITH OIDS, they have to be altered to remove the oid column first. - Functionality to access the oid of the last inserted row (like plpgsql's RESULT_OID, spi's SPI_lastoid, ...) has been removed. The syntax for declaring a table WITHOUT OIDS (or WITH (oids = false) for CREATE TABLE) is still supported. While that requires a bit of support code, it seems unnecessary to break applications / dumps that do not use oids, and are explicit about not using them. The biggest user of WITH OID columns was postgres' catalog. This commit changes all 'magic' oid columns to be columns that are normally declared and stored. To reduce unnecessary query breakage all the newly added columns are still named 'oid', even if a table's column naming scheme would indicate 'reloid' or such. This obviously requires adapting a lot code, mostly replacing oid access via HeapTupleGetOid() with access to the underlying Form_pg_*->oid column. The bootstrap process now assigns oids for all oid columns in genbki.pl that do not have an explicit value (starting at the largest oid previously used), only oids assigned later by oids will be above FirstBootstrapObjectId. As the oid column now is a normal column the special bootstrap syntax for oids has been removed. Oids are not automatically assigned during insertion anymore, all backend code explicitly assigns oids with GetNewOidWithIndex(). For the rare case that insertions into the catalog via SQL are called for the new pg_nextoid() function can be used (which only works on catalog tables). The fact that oid columns on system tables are now normal columns means that they will be included in the set of columns expanded by * (i.e. SELECT * FROM pg_class will now include the table's oid, previously it did not). It'd not technically be hard to hide oid column by default, but that'd mean confusing behavior would either have to be carried forward forever, or it'd cause breakage down the line. While it's not unlikely that further adjustments are needed, the scope/invasiveness of the patch makes it worthwhile to get merge this now. It's painful to maintain externally, too complicated to commit after the code code freeze, and a dependency of a number of other patches. Catversion bump, for obvious reasons. Author: Andres Freund, with contributions by John Naylor Discussion: https://postgr.es/m/20180930034810.ywp2c7awz7opzcfr@alap3.anarazel.de
2018-11-21 00:36:57 +01:00
heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Given infomask/infomask2, compute the bits that must be saved in the
* "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
* xl_heap_lock_updated WAL records.
*
* See fix_infomask_from_infobits.
*/
static uint8
compute_infobits(uint16 infomask, uint16 infomask2)
{
return
((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
/* note we ignore HEAP_XMAX_SHR_LOCK here */
((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
XLHL_KEYS_UPDATED : 0);
}
Fix race when updating a tuple concurrently locked by another process If a tuple is locked, and this lock is later upgraded either to an update or to a stronger lock, and in the meantime some other process tries to lock, update or delete the same tuple, it (the tuple) could end up being updated twice, or having conflicting locks held. The reason for this is that the second updater checks for a change in Xmax value, or in the HEAP_XMAX_IS_MULTI infomask bit, after noticing the first lock; and if there's a change, it restarts and re-evaluates its ability to update the tuple. But it neglected to check for changes in lock strength or in lock-vs-update status when those two properties stayed the same. This would lead it to take the wrong decision and continue with its own update, when in reality it shouldn't do so but instead restart from the top. This could lead to either an assertion failure much later (when a multixact containing multiple updates is detected), or duplicate copies of tuples. To fix, make sure to compare the other relevant infomask bits alongside the Xmax value and HEAP_XMAX_IS_MULTI bit, and restart from the top if necessary. Also, in the belt-and-suspenders spirit, add a check to MultiXactCreateFromMembers that a multixact being created does not have two or more members that are claimed to be updates. This should protect against other bugs that might cause similar bogus situations. Backpatch to 9.3, where the possibility of multixacts containing updates was introduced. (In prior versions it was possible to have the tuple lock upgraded from shared to exclusive, and an update would not restart from the top; yet we're protected against a bug there because there's always a sleep to wait for the locking transaction to complete before continuing to do anything. Really, the fact that tuple locks always conflicted with concurrent updates is what protected against bugs here.) Per report from Andrew Dunstan and Josh Berkus in thread at http://www.postgresql.org/message-id/534C8B33.9050807@pgexperts.com Bug analysis by Andres Freund.
2014-04-24 20:41:55 +02:00
/*
* Given two versions of the same t_infomask for a tuple, compare them and
* return whether the relevant status for a tuple Xmax has changed. This is
* used after a buffer lock has been released and reacquired: we want to ensure
* that the tuple state continues to be the same it was when we previously
* examined it.
*
* Note the Xmax field itself must be compared separately.
*/
static inline bool
xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
{
const uint16 interesting =
HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
Fix race when updating a tuple concurrently locked by another process If a tuple is locked, and this lock is later upgraded either to an update or to a stronger lock, and in the meantime some other process tries to lock, update or delete the same tuple, it (the tuple) could end up being updated twice, or having conflicting locks held. The reason for this is that the second updater checks for a change in Xmax value, or in the HEAP_XMAX_IS_MULTI infomask bit, after noticing the first lock; and if there's a change, it restarts and re-evaluates its ability to update the tuple. But it neglected to check for changes in lock strength or in lock-vs-update status when those two properties stayed the same. This would lead it to take the wrong decision and continue with its own update, when in reality it shouldn't do so but instead restart from the top. This could lead to either an assertion failure much later (when a multixact containing multiple updates is detected), or duplicate copies of tuples. To fix, make sure to compare the other relevant infomask bits alongside the Xmax value and HEAP_XMAX_IS_MULTI bit, and restart from the top if necessary. Also, in the belt-and-suspenders spirit, add a check to MultiXactCreateFromMembers that a multixact being created does not have two or more members that are claimed to be updates. This should protect against other bugs that might cause similar bogus situations. Backpatch to 9.3, where the possibility of multixacts containing updates was introduced. (In prior versions it was possible to have the tuple lock upgraded from shared to exclusive, and an update would not restart from the top; yet we're protected against a bug there because there's always a sleep to wait for the locking transaction to complete before continuing to do anything. Really, the fact that tuple locks always conflicted with concurrent updates is what protected against bugs here.) Per report from Andrew Dunstan and Josh Berkus in thread at http://www.postgresql.org/message-id/534C8B33.9050807@pgexperts.com Bug analysis by Andres Freund.
2014-04-24 20:41:55 +02:00
if ((new_infomask & interesting) != (old_infomask & interesting))
return true;
return false;
}
/*
* heap_delete - delete a tuple
*
* See table_tuple_delete() for an explanation of the parameters, except that
* this routine directly takes a tuple rather than a slot.
*
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
* t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
* only for TM_SelfModified, since we cannot obtain cmax from a combo CID
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* generated by another transaction).
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result
heap_delete(Relation relation, ItemPointer tid,
CommandId cid, Snapshot crosscheck, bool wait,
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_FailureData *tmfd, bool changingPart)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result result;
TransactionId xid = GetCurrentTransactionId();
1998-11-27 20:52:36 +01:00
ItemId lp;
HeapTupleData tp;
Page page;
BlockNumber block;
Buffer buffer;
Buffer vmbuffer = InvalidBuffer;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
TransactionId new_xmax;
uint16 new_infomask,
new_infomask2;
bool have_tuple_lock = false;
bool iscombo;
bool all_visible_cleared = false;
HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
bool old_key_copied = false;
Assert(ItemPointerIsValid(tid));
/*
* Forbid this during a parallel operation, lest it allocate a combo CID.
* Other workers might need that combo CID for visibility checks, and we
* have no provision for broadcasting it to them.
*/
if (IsInParallelMode())
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
errmsg("cannot delete tuples during a parallel operation")));
block = ItemPointerGetBlockNumber(tid);
buffer = ReadBuffer(relation, block);
page = BufferGetPage(buffer);
/*
* Before locking the buffer, pin the visibility map page if it appears to
* be necessary. Since we haven't got the lock yet, someone else might be
* in the middle of changing this, so we'll need to recheck after we have
* the lock.
*/
if (PageIsAllVisible(page))
visibilitymap_pin(relation, block, &vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
Assert(ItemIdIsNormal(lp));
tp.t_tableOid = RelationGetRelid(relation);
tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
tp.t_len = ItemIdGetLength(lp);
tp.t_self = *tid;
l1:
/*
* If we didn't pin the visibility map page and the page has become all
* visible while we were busy locking the buffer, we'll have to unlock and
* re-lock, to avoid holding the buffer lock across an I/O. That's a bit
* unfortunate, but hopefully shouldn't happen often.
*/
if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
visibilitymap_pin(relation, block, &vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
}
result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
1999-05-25 18:15:34 +02:00
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result == TM_Invisible)
1997-08-06 04:08:39 +02:00
{
UnlockReleaseBuffer(buffer);
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("attempted to delete invisible tuple")));
1997-08-06 04:08:39 +02:00
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
else if (result == TM_BeingModified && wait)
{
TransactionId xwait;
uint16 infomask;
/* must copy state data before unlocking buffer */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
infomask = tp.t_data->t_infomask;
/*
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
* Sleep until concurrent transaction ends -- except when there's a
* single locker and it's our own transaction. Note we don't care
* which lock mode the locker has, because we need the strongest one.
*
* Before sleeping, we need to acquire tuple lock to establish our
* priority for the tuple (see heap_lock_tuple). LockTuple will
* release us when we are next-in-line for the tuple.
*
* If we are forced to "start over" below, we keep the tuple lock;
* this arranges that we stay at the head of the line while rechecking
* tuple state.
*/
if (infomask & HEAP_XMAX_IS_MULTI)
{
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
bool current_is_member = false;
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
LockTupleExclusive, &current_is_member))
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
/*
* Acquire the lock, if necessary (but skip it when we're
* requesting a lock and already have one; avoids deadlock).
*/
if (!current_is_member)
heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
LockWaitBlock, &have_tuple_lock);
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/* wait for multixact */
MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
relation, &(tp.t_self), XLTW_Delete,
NULL);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/*
* If xwait had just locked the tuple then some other xact
* could update this tuple before we get to this point. Check
* for xmax change, and start over if so.
*
* We also must start over if we didn't pin the VM page, and
* the page has become all visible.
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
*/
if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
xwait))
goto l1;
}
/*
* You might think the multixact is necessarily done here, but not
* so: it could have surviving members, namely our own xact or
* other subxacts of this backend. It is legal for us to delete
* the tuple in either case, however (the latter case is
* essentially a situation of upgrading our former shared lock to
* exclusive). We don't bother changing the on-disk hint bits
* since we are about to overwrite the xmax altogether.
*/
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
else if (!TransactionIdIsCurrentTransactionId(xwait))
{
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/*
* Wait for regular transaction to end; but first, acquire tuple
* lock.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
LockWaitBlock, &have_tuple_lock);
XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/*
* xwait is done, but if xwait had just locked the tuple then some
* other xact could update this tuple before we get to this point.
* Check for xmax change, and start over if so.
*
* We also must start over if we didn't pin the VM page, and the
* page has become all visible.
*/
if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
xwait))
goto l1;
/* Otherwise check if it committed or aborted */
UpdateXmaxHintBits(tp.t_data, buffer, xwait);
}
/*
* We may overwrite if previous xmax aborted, or if it committed but
* only locked the tuple without updating it.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
HeapTupleHeaderIsOnlyLocked(tp.t_data))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Updated;
else
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Deleted;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (crosscheck != InvalidSnapshot && result == TM_Ok)
{
/* Perform additional check for transaction-snapshot mode RI updates */
if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Updated;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result != TM_Ok)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(result == TM_SelfModified ||
result == TM_Updated ||
result == TM_Deleted ||
result == TM_BeingModified);
Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(result != TM_Updated ||
!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
tmfd->ctid = tp.t_data->t_ctid;
tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
if (result == TM_SelfModified)
tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
else
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
tmfd->cmax = InvalidCommandId;
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
return result;
}
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
/*
* We're about to do the actual delete -- check for conflict first, to
* avoid possibly having to roll back work we've just done.
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
*
* This is safe without a recheck as long as there is no possibility of
* another process scanning the page between this check and the delete
* being visible to the scan (i.e., an exclusive buffer content lock is
* continuously held from this point until the tuple delete is visible).
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
*/
CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
/* replace cid with a combo CID if necessary */
HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
/*
* Compute replica identity tuple before entering the critical section so
* we don't PANIC upon a memory allocation failure.
*/
old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
/*
* If this is the first possibly-multixact-able operation in the current
* transaction, set my per-backend OldestMemberMXactId setting. We can be
* certain that the transaction will never become a member of any older
* MultiXactIds than that. (We have to do this even if we end up just
* using our own TransactionId below, since some other backend could
* incorporate our XID into a MultiXact immediately afterwards.)
*/
MultiXactIdSetOldestMember();
compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
tp.t_data->t_infomask, tp.t_data->t_infomask2,
xid, LockTupleExclusive, true,
&new_xmax, &new_infomask, &new_infomask2);
START_CRIT_SECTION();
/*
* If this transaction commits, the tuple will become DEAD sooner or
* later. Set flag that this page is a candidate for pruning once our xid
* falls below the OldestXmin horizon. If the transaction finally aborts,
* the subsequent page pruning will be a no-op and the hint will be
* cleared.
*/
PageSetPrunable(page, xid);
if (PageIsAllVisible(page))
{
all_visible_cleared = true;
PageClearAllVisible(page);
visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
vmbuffer, VISIBILITYMAP_VALID_BITS);
}
/* store transaction information of xact deleting the tuple */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
tp.t_data->t_infomask |= new_infomask;
tp.t_data->t_infomask2 |= new_infomask2;
HeapTupleHeaderClearHotUpdated(tp.t_data);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
/* Make sure there is no forward chain link in t_ctid */
tp.t_data->t_ctid = tp.t_self;
2018-04-07 22:24:10 +02:00
/* Signal that this is actually a move into another partition */
if (changingPart)
HeapTupleHeaderSetMovedPartitions(tp.t_data);
MarkBufferDirty(buffer);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
/*
* XLOG stuff
*
* NB: heap_abort_speculative() uses the same xlog record and replay
* routines.
*/
if (RelationNeedsWAL(relation))
2000-06-02 12:20:27 +02:00
{
xl_heap_delete xlrec;
xl_heap_header xlhdr;
XLogRecPtr recptr;
/*
* For logical decode we need combo CIDs to properly decode the
* catalog
*/
if (RelationIsAccessibleInLogicalDecoding(relation))
log_heap_new_cid(relation, &tp);
2018-04-07 22:24:10 +02:00
xlrec.flags = 0;
if (all_visible_cleared)
xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
if (changingPart)
xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
tp.t_data->t_infomask2);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xlrec.xmax = new_xmax;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (old_key_tuple != NULL)
{
if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
else
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
}
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
/*
* Log replica identity of the deleted tuple if there is one
*/
if (old_key_tuple != NULL)
{
xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
XLogRegisterData((char *) old_key_tuple->t_data
+ SizeofHeapTupleHeader,
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
old_key_tuple->t_len
- SizeofHeapTupleHeader);
}
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/* filtering by origin on a row level is much more efficient */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
2000-06-02 12:20:27 +02:00
PageSetLSN(page, recptr);
2000-06-02 12:20:27 +02:00
}
END_CRIT_SECTION();
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
/*
* If the tuple has toasted out-of-line attributes, we need to delete
* those items too. We have to do this before releasing the buffer
* because we need to look at the contents of the tuple, but it's OK to
* release the content lock on the buffer first.
*/
if (relation->rd_rel->relkind != RELKIND_RELATION &&
relation->rd_rel->relkind != RELKIND_MATVIEW)
{
/* toast table entries should never be recursively toasted */
Assert(!HeapTupleHasExternal(&tp));
}
else if (HeapTupleHasExternal(&tp))
heap_toast_delete(relation, &tp, false);
/*
* Mark tuple for invalidation from system caches at next command
* boundary. We have to do this before releasing the buffer because we
* need to look at the contents of the tuple.
*/
CacheInvalidateHeapTuple(relation, &tp, NULL);
/* Now we can release the buffer */
ReleaseBuffer(buffer);
/*
* Release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
pgstat_count_heap_delete(relation);
if (old_key_tuple != NULL && old_key_copied)
heap_freetuple(old_key_tuple);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
}
/*
* simple_heap_delete - delete a tuple
*
* This routine may be used to delete a tuple when concurrent updates of
* the target tuple are not expected (for example, because we have a lock
* on the relation associated with the tuple). Any failure is reported
* via ereport().
*/
void
simple_heap_delete(Relation relation, ItemPointer tid)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result result;
TM_FailureData tmfd;
result = heap_delete(relation, tid,
GetCurrentCommandId(true), InvalidSnapshot,
true /* wait for commit */ ,
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
&tmfd, false /* changingPart */ );
switch (result)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_SelfModified:
/* Tuple was already updated in current command? */
elog(ERROR, "tuple already updated by self");
break;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_Ok:
/* done successfully */
break;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_Updated:
elog(ERROR, "tuple concurrently updated");
break;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_Deleted:
elog(ERROR, "tuple concurrently deleted");
break;
default:
elog(ERROR, "unrecognized heap_delete status: %u", result);
break;
}
}
/*
1999-11-24 01:44:37 +01:00
* heap_update - replace a tuple
*
* See table_tuple_update() for an explanation of the parameters, except that
* this routine directly takes a tuple rather than a slot.
*
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
* t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
* only for TM_SelfModified, since we cannot obtain cmax from a combo CID
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* generated by another transaction).
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result
1999-11-24 01:44:37 +01:00
heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
CommandId cid, Snapshot crosscheck, bool wait,
TM_FailureData *tmfd, LockTupleMode *lockmode,
TU_UpdateIndexes *update_indexes)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result result;
TransactionId xid = GetCurrentTransactionId();
Bitmapset *hot_attrs;
Bitmapset *sum_attrs;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Bitmapset *key_attrs;
Bitmapset *id_attrs;
Bitmapset *interesting_attrs;
Bitmapset *modified_attrs;
1998-11-27 20:52:36 +01:00
ItemId lp;
HeapTupleData oldtup;
HeapTuple heaptup;
HeapTuple old_key_tuple = NULL;
bool old_key_copied = false;
Page page;
BlockNumber block;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
MultiXactStatus mxact_status;
Buffer buffer,
newbuf,
vmbuffer = InvalidBuffer,
vmbuffer_new = InvalidBuffer;
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
bool need_toast;
Size newtupsize,
pagefree;
bool have_tuple_lock = false;
bool iscombo;
bool use_hot_update = false;
bool summarized_update = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
bool key_intact;
bool all_visible_cleared = false;
bool all_visible_cleared_new = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
bool checked_lockers;
bool locker_remains;
bool id_has_external = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
TransactionId xmax_new_tuple,
xmax_old_tuple;
uint16 infomask_old_tuple,
infomask2_old_tuple,
infomask_new_tuple,
infomask2_new_tuple;
Assert(ItemPointerIsValid(otid));
Fix mishandling of resjunk columns in ON CONFLICT ... UPDATE tlists. It's unusual to have any resjunk columns in an ON CONFLICT ... UPDATE list, but it can happen when MULTIEXPR_SUBLINK SubPlans are present. If it happens, the ON CONFLICT UPDATE code path would end up storing tuples that include the values of the extra resjunk columns. That's fairly harmless in the short run, but if new columns are added to the table then the values would become accessible, possibly leading to malfunctions if they don't match the datatypes of the new columns. This had escaped notice through a confluence of missing sanity checks, including * There's no cross-check that a tuple presented to heap_insert or heap_update matches the table rowtype. While it's difficult to check that fully at reasonable cost, we can easily add assertions that there aren't too many columns. * The output-column-assignment cases in execExprInterp.c lacked any sanity checks on the output column numbers, which seems like an oversight considering there are plenty of assertion checks on input column numbers. Add assertions there too. * We failed to apply nodeModifyTable's ExecCheckPlanOutput() to the ON CONFLICT UPDATE tlist. That wouldn't have caught this specific error, since that function is chartered to ignore resjunk columns; but it sure seems like a bad omission now that we've seen this bug. In HEAD, the right way to fix this is to make the processing of ON CONFLICT UPDATE tlists work the same as regular UPDATE tlists now do, that is don't add "SET x = x" entries, and use ExecBuildUpdateProjection to evaluate the tlist and combine it with old values of the not-set columns. This adds a little complication to ExecBuildUpdateProjection, but allows removal of a comparable amount of now-dead code from the planner. In the back branches, the most expedient solution seems to be to (a) use an output slot for the ON CONFLICT UPDATE projection that actually matches the target table, and then (b) invent a variant of ExecBuildProjectionInfo that can be told to not store values resulting from resjunk columns, so it doesn't try to store into nonexistent columns of the output slot. (We can't simply ignore the resjunk columns altogether; they have to be evaluated for MULTIEXPR_SUBLINK to work.) This works back to v10. In 9.6, projections work much differently and we can't cheaply give them such an option. The 9.6 version of this patch works by inserting a JunkFilter when it's necessary to get rid of resjunk columns. In addition, v11 and up have the reverse problem when trying to perform ON CONFLICT UPDATE on a partitioned table. Through a further oversight, adjust_partition_tlist() discarded resjunk columns when re-ordering the ON CONFLICT UPDATE tlist to match a partition. This accidentally prevented the storing-bogus-tuples problem, but at the cost that MULTIEXPR_SUBLINK cases didn't work, typically crashing if more than one row has to be updated. Fix by preserving resjunk columns in that routine. (I failed to resist the temptation to add more assertions there too, and to do some minor code beautification.) Per report from Andres Freund. Back-patch to all supported branches. Security: CVE-2021-32028
2021-05-10 17:02:29 +02:00
/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
RelationGetNumberOfAttributes(relation));
/*
* Forbid this during a parallel operation, lest it allocate a combo CID.
* Other workers might need that combo CID for visibility checks, and we
* have no provision for broadcasting it to them.
*/
if (IsInParallelMode())
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
errmsg("cannot update tuples during a parallel operation")));
/*
* Fetch the list of attributes to be checked for various operations.
*
* For HOT considerations, this is wasted effort if we fail to update or
* have to put the new tuple on a different page. But we must compute the
* list before obtaining buffer lock --- in the worst case, if we are
* doing an update on one of the relevant system catalogs, we could
* deadlock if we try to fetch the list later. In any case, the relcache
* caches the data so this is usually pretty cheap.
*
* We also need columns used by the replica identity and columns that are
* considered the "key" of rows in the table.
*
* Note that we get copies of each bitmap, so we need not worry about
* relcache flush happening midway through.
*/
hot_attrs = RelationGetIndexAttrBitmap(relation,
INDEX_ATTR_BITMAP_HOT_BLOCKING);
sum_attrs = RelationGetIndexAttrBitmap(relation,
INDEX_ATTR_BITMAP_SUMMARIZED);
key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
id_attrs = RelationGetIndexAttrBitmap(relation,
INDEX_ATTR_BITMAP_IDENTITY_KEY);
interesting_attrs = NULL;
interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
interesting_attrs = bms_add_members(interesting_attrs, sum_attrs);
interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
block = ItemPointerGetBlockNumber(otid);
buffer = ReadBuffer(relation, block);
page = BufferGetPage(buffer);
/*
* Before locking the buffer, pin the visibility map page if it appears to
* be necessary. Since we haven't got the lock yet, someone else might be
* in the middle of changing this, so we'll need to recheck after we have
* the lock.
*/
if (PageIsAllVisible(page))
visibilitymap_pin(relation, block, &vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
Assert(ItemIdIsNormal(lp));
/*
* Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
* properly.
*/
oldtup.t_tableOid = RelationGetRelid(relation);
oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1998-11-27 20:52:36 +01:00
oldtup.t_len = ItemIdGetLength(lp);
oldtup.t_self = *otid;
2001-03-22 05:01:46 +01:00
/* the new tuple is ready, except for this: */
newtup->t_tableOid = RelationGetRelid(relation);
/*
* Determine columns modified by the update. Additionally, identify
* whether any of the unmodified replica identity key attributes in the
* old tuple is externally stored or not. This is required because for
* such attributes the flattened value won't be WAL logged as part of the
* new tuple so we must include it as part of the old_key_tuple. See
* ExtractReplicaIdentity.
*/
modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs,
id_attrs, &oldtup,
newtup, &id_has_external);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* If we're not updating any "key" column, we can grab a weaker lock type.
* This allows for more concurrency when we are running simultaneously
* with foreign key checks.
*
* Note that if a column gets detoasted while executing the update, but
* the value ends up being the same, this test will fail and we will use
* the stronger lock. This is acceptable; the important case to optimize
* is updates that don't manipulate key columns, not those that
* serendipitously arrive at the same key values.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
if (!bms_overlap(modified_attrs, key_attrs))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
*lockmode = LockTupleNoKeyExclusive;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
mxact_status = MultiXactStatusNoKeyUpdate;
key_intact = true;
/*
* If this is the first possibly-multixact-able operation in the
* current transaction, set my per-backend OldestMemberMXactId
* setting. We can be certain that the transaction will never become a
* member of any older MultiXactIds than that. (We have to do this
* even if we end up just using our own TransactionId below, since
* some other backend could incorporate our XID into a MultiXact
* immediately afterwards.)
*/
MultiXactIdSetOldestMember();
}
else
{
*lockmode = LockTupleExclusive;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
mxact_status = MultiXactStatusUpdate;
key_intact = false;
}
/*
* Note: beyond this point, use oldtup not otid to refer to old tuple.
* otid may very well point at newtup->t_self, which we will overwrite
* with the new tuple's location, so there's great risk of confusion if we
* use otid anymore.
*/
l2:
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
checked_lockers = false;
locker_remains = false;
result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
1999-05-25 18:15:34 +02:00
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/* see below about the "no wait" case */
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(result != TM_BeingModified || wait);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result == TM_Invisible)
{
UnlockReleaseBuffer(buffer);
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("attempted to update invisible tuple")));
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
else if (result == TM_BeingModified && wait)
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
TransactionId xwait;
uint16 infomask;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
bool can_continue = false;
/*
* XXX note that we don't consider the "no wait" case here. This
* isn't a problem currently because no caller uses that case, but it
* should be fixed if such a caller is introduced. It wasn't a
* problem previously because this code would always wait, but now
* that some tuple locks do not conflict with one of the lock modes we
* use, it is possible that this case is interesting to handle
* specially.
*
* This may cause failures with third-party code that calls
* heap_update directly.
*/
/* must copy state data before unlocking buffer */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
infomask = oldtup.t_data->t_infomask;
/*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* Now we have to do something about the existing locker. If it's a
* multi, sleep on it; we might be awakened before it is completely
* gone (or even not sleep at all in some cases); we need to preserve
* it as locker, unless it is gone completely.
*
* If it's not a multi, we need to check for sleeping conditions
* before actually going to sleep. If the update doesn't conflict
* with the locks, we just continue without sleeping (but making sure
* it is preserved).
*
* Before sleeping, we need to acquire tuple lock to establish our
* priority for the tuple (see heap_lock_tuple). LockTuple will
* release us when we are next-in-line for the tuple. Note we must
* not acquire the tuple lock until we're sure we're going to sleep;
* otherwise we're open for race conditions with other transactions
* holding the tuple lock which sleep on us.
*
* If we are forced to "start over" below, we keep the tuple lock;
* this arranges that we stay at the head of the line while rechecking
* tuple state.
*/
if (infomask & HEAP_XMAX_IS_MULTI)
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
TransactionId update_xact;
int remain;
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
bool current_is_member = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
*lockmode, &current_is_member))
{
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
/*
* Acquire the lock, if necessary (but skip it when we're
* requesting a lock and already have one; avoids deadlock).
*/
if (!current_is_member)
heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
LockWaitBlock, &have_tuple_lock);
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/* wait for multixact */
MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
relation, &oldtup.t_self, XLTW_Update,
&remain);
checked_lockers = true;
locker_remains = remain != 0;
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/*
* If xwait had just locked the tuple then some other xact
* could update this tuple before we get to this point. Check
* for xmax change, and start over if so.
*/
if (xmax_infomask_changed(oldtup.t_data->t_infomask,
infomask) ||
!TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
xwait))
goto l2;
}
/*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* Note that the multixact may not be done by now. It could have
* surviving members; our own xact or other subxacts of this
* backend, and also any other concurrent transaction that locked
* the tuple with LockTupleKeyShare if we only got
* LockTupleNoKeyExclusive. If this is the case, we have to be
* careful to mark the updated tuple with the surviving members in
* Xmax.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*
* Note that there could have been another update in the
* MultiXact. In that case, we need to check whether it committed
* or aborted. If it aborted we are safe to update it again;
* otherwise there is an update conflict, and we have to return
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* TableTuple{Deleted, Updated} below.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*
* In the LockTupleExclusive case, we still need to preserve the
* surviving members: those would include the tuple locks we had
* before this one, which are important to keep in case this
* subxact aborts.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
else
update_xact = InvalidTransactionId;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* There was no UPDATE in the MultiXact; or it aborted. No
* TransactionIdIsInProgress() call needed here, since we called
* MultiXactIdWait() above.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (!TransactionIdIsValid(update_xact) ||
TransactionIdDidAbort(update_xact))
can_continue = true;
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
}
else if (TransactionIdIsCurrentTransactionId(xwait))
{
/*
* The only locker is ourselves; we can avoid grabbing the tuple
* lock here, but must preserve our locking information.
*/
checked_lockers = true;
locker_remains = true;
can_continue = true;
}
else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
{
/*
* If it's just a key-share locker, and we're not changing the key
* columns, we don't need to wait for it to end; but we need to
* preserve it as locker.
*/
checked_lockers = true;
locker_remains = true;
can_continue = true;
}
else
{
/*
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
* Wait for regular transaction to end; but first, acquire tuple
* lock.
*/
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
LockWaitBlock, &have_tuple_lock);
XactLockTableWait(xwait, relation, &oldtup.t_self,
XLTW_Update);
checked_lockers = true;
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/*
* xwait is done, but if xwait had just locked the tuple then some
* other xact could update this tuple before we get to this point.
* Check for xmax change, and start over if so.
*/
if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
!TransactionIdEquals(xwait,
HeapTupleHeaderGetRawXmax(oldtup.t_data)))
goto l2;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/* Otherwise check if it committed or aborted */
UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
can_continue = true;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (can_continue)
result = TM_Ok;
else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Updated;
else
result = TM_Deleted;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (crosscheck != InvalidSnapshot && result == TM_Ok)
{
/* Perform additional check for transaction-snapshot mode RI updates */
if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
{
result = TM_Updated;
Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
}
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result != TM_Ok)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(result == TM_SelfModified ||
result == TM_Updated ||
result == TM_Deleted ||
result == TM_BeingModified);
Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(result != TM_Updated ||
!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
tmfd->ctid = oldtup.t_data->t_ctid;
tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
if (result == TM_SelfModified)
tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
else
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
tmfd->cmax = InvalidCommandId;
UnlockReleaseBuffer(buffer);
if (have_tuple_lock)
UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
*update_indexes = TU_None;
bms_free(hot_attrs);
bms_free(sum_attrs);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
bms_free(key_attrs);
bms_free(id_attrs);
bms_free(modified_attrs);
bms_free(interesting_attrs);
return result;
}
/*
* If we didn't pin the visibility map page and the page has become all
* visible while we were busy locking the buffer, or during some
* subsequent window during which we had it unlocked, we'll have to unlock
* and re-lock, to avoid holding the buffer lock across an I/O. That's a
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* bit unfortunate, especially since we'll now have to recheck whether the
* tuple has been locked or updated under us, but hopefully it won't
* happen very often.
*/
if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
visibilitymap_pin(relation, block, &vmbuffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
goto l2;
}
/* Fill in transaction status data */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* If the tuple we're updating is locked, we need to preserve the locking
* info in the old tuple's Xmax. Prepare a new Xmax value for this.
*/
compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
oldtup.t_data->t_infomask,
oldtup.t_data->t_infomask2,
xid, *lockmode, true,
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
&xmax_old_tuple, &infomask_old_tuple,
&infomask2_old_tuple);
/*
* And also prepare an Xmax value for the new copy of the tuple. If there
* was no xmax previously, or there was one but all lockers are now gone,
* then use InvalidTransactionId; otherwise, get the xmax from the old
* tuple. (In rare cases that might also be InvalidTransactionId and yet
* not have the HEAP_XMAX_INVALID bit set; that's fine.)
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
Fix handling of multixacts predating pg_upgrade After pg_upgrade, it is possible that some tuples' Xmax have multixacts corresponding to the old installation; such multixacts cannot have running members anymore. In many code sites we already know not to read them and clobber them silently, but at least when VACUUM tries to freeze a multixact or determine whether one needs freezing, there's an attempt to resolve it to its member transactions by calling GetMultiXactIdMembers, and if the multixact value is "in the future" with regards to the current valid multixact range, an error like this is raised: ERROR: MultiXactId 123 has not been created yet -- apparent wraparound and vacuuming fails. Per discussion with Andrew Gierth, it is completely bogus to try to resolve multixacts coming from before a pg_upgrade, regardless of where they stand with regards to the current valid multixact range. It's possible to get from under this problem by doing SELECT FOR UPDATE of the problem tuples, but if tables are large, this is slow and tedious, so a more thorough solution is desirable. To fix, we realize that multixacts in xmax created in 9.2 and previous have a specific bit pattern that is never used in 9.3 and later (we already knew this, per comments and infomask tests sprinkled in various places, but we weren't leveraging this knowledge appropriately). Whenever the infomask of the tuple matches that bit pattern, we just ignore the multixact completely as if Xmax wasn't set; or, in the case of tuple freezing, we act as if an unwanted value is set and clobber it without decoding. This guarantees that no errors will be raised, and that the values will be progressively removed until all tables are clean. Most callers of GetMultiXactIdMembers are patched to recognize directly that the value is a removable "empty" multixact and avoid calling GetMultiXactIdMembers altogether. To avoid changing the signature of GetMultiXactIdMembers() in back branches, we keep the "allow_old" boolean flag but rename it to "from_pgupgrade"; if the flag is true, we always return an empty set instead of looking up the multixact. (I suppose we could remove the argument in the master branch, but I chose not to do so in this commit). This was broken all along, but the error-facing message appeared first because of commit 8e9a16ab8f7f and was partially fixed in a25c2b7c4db3. This fix, backpatched all the way back to 9.3, goes approximately in the same direction as a25c2b7c4db3 but should cover all cases. Bug analysis by Andrew Gierth and Álvaro Herrera. A number of public reports match this bug: https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-25 00:29:28 +02:00
HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
(checked_lockers && !locker_remains))
xmax_new_tuple = InvalidTransactionId;
else
xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
if (!TransactionIdIsValid(xmax_new_tuple))
{
infomask_new_tuple = HEAP_XMAX_INVALID;
infomask2_new_tuple = 0;
}
else
{
/*
* If we found a valid Xmax for the new tuple, then the infomask bits
* to use on the new tuple depend on what was there on the old one.
* Note that since we're doing an update, the only possibility is that
* the lockers had FOR KEY SHARE lock.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
{
GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
&infomask2_new_tuple);
}
else
{
infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
infomask2_new_tuple = 0;
}
}
/*
* Prepare the new tuple with the appropriate initial values of Xmin and
* Xmax, as well as initial infomask bits as computed above.
*/
1998-11-27 20:52:36 +01:00
newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
HeapTupleHeaderSetXmin(newtup->t_data, xid);
HeapTupleHeaderSetCmin(newtup->t_data, cid);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
newtup->t_data->t_infomask2 |= infomask2_new_tuple;
HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
/*
* Replace cid with a combo CID if necessary. Note that we already put
* the plain cid into the new tuple.
*/
HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
/*
* If the toaster needs to be activated, OR if the new tuple will not fit
* on the same page as the old, then we need to release the content lock
* (but not the pin!) on the old tuple's buffer while we are off doing
* TOAST and/or table-file-extension work. We must mark the old tuple to
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
* show that it's locked, else other processes may try to update it
* themselves.
*
* We need to invoke the toaster if there are already any out-of-line
* toasted values present, or if the new tuple is over-threshold.
*/
if (relation->rd_rel->relkind != RELKIND_RELATION &&
relation->rd_rel->relkind != RELKIND_MATVIEW)
{
/* toast table entries should never be recursively toasted */
Assert(!HeapTupleHasExternal(&oldtup));
Assert(!HeapTupleHasExternal(newtup));
need_toast = false;
}
else
need_toast = (HeapTupleHasExternal(&oldtup) ||
HeapTupleHasExternal(newtup) ||
newtup->t_len > TOAST_TUPLE_THRESHOLD);
pagefree = PageGetHeapFreeSpace(page);
newtupsize = MAXALIGN(newtup->t_len);
if (need_toast || newtupsize > pagefree)
{
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
TransactionId xmax_lock_old_tuple;
uint16 infomask_lock_old_tuple,
infomask2_lock_old_tuple;
bool cleared_all_frozen = false;
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
/*
* To prevent concurrent sessions from updating the tuple, we have to
2017-08-09 00:46:16 +02:00
* temporarily mark it locked, while we release the page-level lock.
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
*
* To satisfy the rule that any xid potentially appearing in a buffer
* written out to disk, we unfortunately have to WAL log this
* temporary modification. We can reuse xl_heap_lock for this
* purpose. If we crash/error before following through with the
* actual update, xmax will be of an aborted transaction, allowing
* other sessions to proceed.
*/
/*
* Compute xmax / infomask appropriate for locking the tuple. This has
2017-08-09 00:46:16 +02:00
* to be done separately from the combo that's going to be used for
* updating, because the potentially created multixact would otherwise
* be wrong.
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
*/
compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
oldtup.t_data->t_infomask,
oldtup.t_data->t_infomask2,
xid, *lockmode, false,
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
&xmax_lock_old_tuple, &infomask_lock_old_tuple,
&infomask2_lock_old_tuple);
Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
START_CRIT_SECTION();
/* Clear obsolete visibility flags ... */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
HeapTupleClearHotUpdated(&oldtup);
/* ... and store info about transaction updating this tuple */
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
Assert(TransactionIdIsValid(xmax_lock_old_tuple));
HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
/* temporarily make it look not-updated, but locked */
oldtup.t_data->t_ctid = oldtup.t_self;
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
/*
* Clear all-frozen bit on visibility map if needed. We could
* immediately reset ALL_VISIBLE, but given that the WAL logging
* overhead would be unchanged, that doesn't seem necessarily
* worthwhile.
*/
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
if (PageIsAllVisible(page) &&
visibilitymap_clear(relation, block, vmbuffer,
VISIBILITYMAP_ALL_FROZEN))
cleared_all_frozen = true;
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
MarkBufferDirty(buffer);
if (RelationNeedsWAL(relation))
{
xl_heap_lock xlrec;
XLogRecPtr recptr;
XLogBeginInsert();
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
xlrec.xmax = xmax_lock_old_tuple;
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
oldtup.t_data->t_infomask2);
xlrec.flags =
cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/*
* Let the toaster do its thing, if needed.
*
* Note: below this point, heaptup is the data we actually intend to
* store into the relation; newtup is the caller's original untoasted
* data.
*/
if (need_toast)
{
/* Note we always use WAL and FSM during updates */
heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
newtupsize = MAXALIGN(heaptup->t_len);
}
else
heaptup = newtup;
/*
* Now, do we need a new page for the tuple, or not? This is a bit
* tricky since someone else could have added tuples to the page while
* we weren't looking. We have to recheck the available space after
* reacquiring the buffer lock. But don't bother to do that if the
* former amount of free space is still not enough; it's unlikely
* there's more free now than before.
*
* What's more, if we need to get a new page, we will need to acquire
* buffer locks on both old and new pages. To avoid deadlock against
* some other backend trying to get the same two locks in the other
* order, we must be consistent about the order we get the locks in.
* We use the rule "lock the lower-numbered page of the relation
* first". To implement this, we must do RelationGetBufferForTuple
* while not holding the lock on the old page, and we must rely on it
* to get the locks on both pages in the correct order.
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
*
* Another consideration is that we need visibility map page pin(s) if
* we will have to clear the all-visible flag on either page. If we
* call RelationGetBufferForTuple, we rely on it to acquire any such
* pins; but if we don't, we have to handle that here. Hence we need
* a loop.
*/
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
for (;;)
{
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
if (newtupsize > pagefree)
{
/* It doesn't fit, must use RelationGetBufferForTuple. */
newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
buffer, 0, NULL,
&vmbuffer_new, &vmbuffer,
0);
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
/* We're all done. */
break;
}
/* Acquire VM page pin if needed and we don't have it. */
if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
visibilitymap_pin(relation, block, &vmbuffer);
/* Re-acquire the lock on the old tuple's page. */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/* Re-check using the up-to-date free space */
pagefree = PageGetHeapFreeSpace(page);
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
if (newtupsize > pagefree ||
(vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
{
/*
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
* Rats, it doesn't fit anymore, or somebody just now set the
* all-visible flag. We must now unlock and loop to avoid
* deadlock. Fortunately, this path should seldom be taken.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}
else
{
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
/* We're all done. */
newbuf = buffer;
Avoid improbable PANIC during heap_update. heap_update needs to clear any existing "all visible" flag on the old tuple's page (and on the new page too, if different). Per coding rules, to do this it must acquire pin on the appropriate visibility-map page while not holding exclusive buffer lock; which creates a race condition since someone else could set the flag whenever we're not holding the buffer lock. The code is supposed to handle that by re-checking the flag after acquiring buffer lock and retrying if it became set. However, one code path through heap_update itself, as well as one in its subroutine RelationGetBufferForTuple, failed to do this. The end result, in the unlikely event that a concurrent VACUUM did set the flag while we're transiently not holding lock, is a non-recurring "PANIC: wrong buffer passed to visibilitymap_clear" failure. This has been seen a few times in the buildfarm since recent VACUUM changes that added code paths that could set the all-visible flag while holding only exclusive buffer lock. Previously, the flag was (usually?) set only after doing LockBufferForCleanup, which would insist on buffer pin count zero, thus preventing the flag from becoming set partway through heap_update. However, it's clear that it's heap_update not VACUUM that's at fault here. What's less clear is whether there is any hazard from these bugs in released branches. heap_update is certainly violating API expectations, but if there is no code path that can set all-visible without a cleanup lock then it's only a latent bug. That's not 100% certain though, besides which we should worry about extensions or future back-patch fixes that could introduce such code paths. I chose to back-patch to v12. Fixing RelationGetBufferForTuple before that would require also back-patching portions of older fixes (notably 0d1fe9f74), which is more code churn than seems prudent to fix a hypothetical issue. Discussion: https://postgr.es/m/2247102.1618008027@sss.pgh.pa.us
2021-04-13 18:17:24 +02:00
break;
}
}
}
else
{
/* No TOAST work needed, and it'll fit on same page */
newbuf = buffer;
heaptup = newtup;
}
2000-07-03 04:54:21 +02:00
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
/*
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* We're about to do the actual update -- check for conflict first, to
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
* avoid possibly having to roll back work we've just done.
*
Fix serialization anomalies due to race conditions on INSERT. On insert the CheckForSerializableConflictIn() test was performed before the page(s) which were going to be modified had been locked (with an exclusive buffer content lock). If another process acquired a relation SIReadLock on the heap and scanned to a page on which an insert was going to occur before the page was so locked, a rw-conflict would be missed, which could allow a serialization anomaly to be missed. The window between the check and the page lock was small, so the bug was generally not noticed unless there was high concurrency with multiple processes inserting into the same table. This was reported by Peter Bailis as bug #11732, by Sean Chittenden as bug #13667, and by others. The race condition was eliminated in heap_insert() by moving the check down below the acquisition of the buffer lock, which had been the very next statement. Because of the loop locking and unlocking multiple buffers in heap_multi_insert() a check was added after all inserts were completed. The check before the start of the inserts was left because it might avoid a large amount of work to detect a serialization anomaly before performing the all of the inserts and the related WAL logging. While investigating this bug, other SSI bugs which were even harder to hit in practice were noticed and fixed, an unnecessary check (covered by another check, so redundant) was removed from heap_update(), and comments were improved. Back-patch to all supported branches. Kevin Grittner and Thomas Munro
2015-10-31 20:43:34 +01:00
* This is safe without a recheck as long as there is no possibility of
* another process scanning the pages between this check and the update
* being visible to the scan (i.e., exclusive buffer content lock(s) are
* continuously held from this point until the tuple update is visible).
*
* For the new tuple the only check needed is at the relation level, but
* since both tuples are in the same relation and the check for oldtup
* will include checking the relation level, there is no benefit to a
* separate check for the new tuple.
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
*/
CheckForSerializableConflictIn(relation, &oldtup.t_self,
BufferGetBlockNumber(buffer));
Implement genuine serializable isolation level. Until now, our Serializable mode has in fact been what's called Snapshot Isolation, which allows some anomalies that could not occur in any serialized ordering of the transactions. This patch fixes that using a method called Serializable Snapshot Isolation, based on research papers by Michael J. Cahill (see README-SSI for full references). In Serializable Snapshot Isolation, transactions run like they do in Snapshot Isolation, but a predicate lock manager observes the reads and writes performed and aborts transactions if it detects that an anomaly might occur. This method produces some false positives, ie. it sometimes aborts transactions even though there is no anomaly. To track reads we implement predicate locking, see storage/lmgr/predicate.c. Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared memory is finite, so when a transaction takes many tuple-level locks on a page, the locks are promoted to a single page-level lock, and further to a single relation level lock if necessary. To lock key values with no matching tuple, a sequential scan always takes a relation-level lock, and an index scan acquires a page-level lock that covers the search key, whether or not there are any matching keys at the moment. A predicate lock doesn't conflict with any regular locks or with another predicate locks in the normal sense. They're only used by the predicate lock manager to detect the danger of anomalies. Only serializable transactions participate in predicate locking, so there should be no extra overhead for for other transactions. Predicate locks can't be released at commit, but must be remembered until all the transactions that overlapped with it have completed. That means that we need to remember an unbounded amount of predicate locks, so we apply a lossy but conservative method of tracking locks for committed transactions. If we run short of shared memory, we overflow to a new "pg_serial" SLRU pool. We don't currently allow Serializable transactions in Hot Standby mode. That would be hard, because even read-only transactions can cause anomalies that wouldn't otherwise occur. Serializable isolation mode now means the new fully serializable level. Repeatable Read gives you the old Snapshot Isolation level that we have always had. Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and Anssi Kääriäinen
2011-02-07 22:46:51 +01:00
/*
* At this point newbuf and buffer are both pinned and locked, and newbuf
* has enough space for the new tuple. If they are the same buffer, only
* one pin is held.
*/
if (newbuf == buffer)
{
/*
* Since the new tuple is going into the same page, we might be able
* to do a HOT update. Check if any of the index columns have been
* changed.
*/
if (!bms_overlap(modified_attrs, hot_attrs))
{
use_hot_update = true;
/*
* If none of the columns that are used in hot-blocking indexes
* were updated, we can apply HOT, but we do still need to check
* if we need to update the summarizing indexes, and update those
* indexes if the columns were updated, or we may fail to detect
* e.g. value bound changes in BRIN minmax indexes.
*/
if (bms_overlap(modified_attrs, sum_attrs))
summarized_update = true;
}
}
else
{
/* Set a hint that the old page could use prune/defrag */
PageSetFull(page);
}
/*
* Compute replica identity tuple before entering the critical section so
* we don't PANIC upon a memory allocation failure.
* ExtractReplicaIdentity() will return NULL if nothing needs to be
* logged. Pass old key required as true only if the replica identity key
* columns are modified or it has external data.
*/
old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
bms_overlap(modified_attrs, id_attrs) ||
id_has_external,
&old_key_copied);
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
2000-07-03 04:54:21 +02:00
/*
* If this transaction commits, the old tuple will become DEAD sooner or
* later. Set flag that this page is a candidate for pruning once our xid
* falls below the OldestXmin horizon. If the transaction finally aborts,
* the subsequent page pruning will be a no-op and the hint will be
* cleared.
*
* XXX Should we set hint on newbuf as well? If the transaction aborts,
* there would be a prunable tuple in the newbuf; but for now we choose
* not to optimize for aborts. Note that heap_xlog_update must be kept in
* sync if this decision changes.
*/
PageSetPrunable(page, xid);
if (use_hot_update)
{
/* Mark the old tuple as HOT-updated */
HeapTupleSetHotUpdated(&oldtup);
/* And mark the new tuple as heap-only */
HeapTupleSetHeapOnly(heaptup);
/* Mark the caller's copy too, in case different from heaptup */
HeapTupleSetHeapOnly(newtup);
}
else
{
/* Make sure tuples are correctly marked as not-HOT */
HeapTupleClearHotUpdated(&oldtup);
HeapTupleClearHeapOnly(heaptup);
HeapTupleClearHeapOnly(newtup);
}
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
Fix torn-page, unlogged xid and further risks from heap_update(). When heap_update needs to look for a page for the new tuple version, because the current one doesn't have sufficient free space, or when columns have to be processed by the tuple toaster, it has to release the lock on the old page during that. Otherwise there'd be lock ordering and lock nesting issues. To avoid concurrent sessions from trying to update / delete / lock the tuple while the page's content lock is released, the tuple's xmax is set to the current session's xid. That unfortunately was done without any WAL logging, thereby violating the rule that no XIDs may appear on disk, without an according WAL record. If the database were to crash / fail over when the page level lock is released, and some activity lead to the page being written out to disk, the xid could end up being reused; potentially leading to the row becoming invisible. There might be additional risks by not having t_ctid point at the tuple itself, without having set the appropriate lock infomask fields. To fix, compute the appropriate xmax/infomask combination for locking the tuple, and perform WAL logging using the existing XLOG_HEAP_LOCK record. That allows the fix to be backpatched. This issue has existed for a long time. There appears to have been partial attempts at preventing dangers, but these never have fully been implemented, and were removed a long time ago, in 11919160 (cf. HEAP_XMAX_UNLOGGED). In master / 9.6, there's an additional issue, namely that the visibilitymap's freeze bit isn't reset at that point yet. Since that's a new issue, introduced only in a892234f830, that'll be fixed in a separate commit. Author: Masahiko Sawada and Andres Freund Reported-By: Different aspects by Thomas Munro, Noah Misch, and others Discussion: CAEepm=3fWAbWryVW9swHyLTY4sXVf0xbLvXqOwUoDiNCx9mBjQ@mail.gmail.com Backpatch: 9.1/all supported versions
2016-07-16 02:49:48 +02:00
/* Clear obsolete visibility flags, possibly set by ourselves above... */
oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
/* ... and store info about transaction updating this tuple */
Assert(TransactionIdIsValid(xmax_old_tuple));
HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
oldtup.t_data->t_infomask |= infomask_old_tuple;
oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
2000-07-03 04:54:21 +02:00
/* record address of new tuple in t_ctid of old one */
oldtup.t_data->t_ctid = heaptup->t_self;
2000-07-03 04:54:21 +02:00
/* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
if (PageIsAllVisible(BufferGetPage(buffer)))
{
all_visible_cleared = true;
PageClearAllVisible(BufferGetPage(buffer));
visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
vmbuffer, VISIBILITYMAP_VALID_BITS);
}
if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
{
all_visible_cleared_new = true;
PageClearAllVisible(BufferGetPage(newbuf));
visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
vmbuffer_new, VISIBILITYMAP_VALID_BITS);
}
if (newbuf != buffer)
MarkBufferDirty(newbuf);
MarkBufferDirty(buffer);
2000-07-03 04:54:21 +02:00
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
XLogRecPtr recptr;
1999-05-25 18:15:34 +02:00
/*
* For logical decoding we need combo CIDs to properly decode the
* catalog.
*/
if (RelationIsAccessibleInLogicalDecoding(relation))
{
log_heap_new_cid(relation, &oldtup);
log_heap_new_cid(relation, heaptup);
}
recptr = log_heap_update(relation, buffer,
newbuf, &oldtup, heaptup,
old_key_tuple,
all_visible_cleared,
all_visible_cleared_new);
2000-07-03 04:54:21 +02:00
if (newbuf != buffer)
{
PageSetLSN(BufferGetPage(newbuf), recptr);
2000-07-03 04:54:21 +02:00
}
PageSetLSN(BufferGetPage(buffer), recptr);
}
END_CRIT_SECTION();
2000-07-03 04:54:21 +02:00
if (newbuf != buffer)
LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/*
* Mark old tuple for invalidation from system caches at next command
* boundary, and mark the new tuple for invalidation in case we abort. We
* have to do this before releasing the buffer because oldtup is in the
* buffer. (heaptup is all in local memory, but it's necessary to process
* both tuple versions in one call to inval.c so we can avoid redundant
* sinval messages.)
*/
CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
/* Now we can release the buffer(s) */
if (newbuf != buffer)
ReleaseBuffer(newbuf);
ReleaseBuffer(buffer);
if (BufferIsValid(vmbuffer_new))
ReleaseBuffer(vmbuffer_new);
if (BufferIsValid(vmbuffer))
ReleaseBuffer(vmbuffer);
/*
* Release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
/*
* If heaptup is a private copy, release it. Don't forget to copy t_self
* back to the caller's image, too.
*/
if (heaptup != newtup)
{
newtup->t_self = heaptup->t_self;
heap_freetuple(heaptup);
}
/*
* If it is a HOT update, the update may still need to update summarized
* indexes, lest we fail to update those summaries and get incorrect
* results (for example, minmax bounds of the block may change with this
* update).
*/
if (use_hot_update)
{
if (summarized_update)
*update_indexes = TU_Summarizing;
else
*update_indexes = TU_None;
}
else
*update_indexes = TU_All;
if (old_key_tuple != NULL && old_key_copied)
heap_freetuple(old_key_tuple);
bms_free(hot_attrs);
bms_free(sum_attrs);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
bms_free(key_attrs);
bms_free(id_attrs);
bms_free(modified_attrs);
bms_free(interesting_attrs);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
}
/*
* Check if the specified attribute's values are the same. Subroutine for
* HeapDetermineColumnsInfo.
*/
static bool
heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
bool isnull1, bool isnull2)
{
Form_pg_attribute att;
/*
* If one value is NULL and other is not, then they are certainly not
* equal
*/
if (isnull1 != isnull2)
return false;
/*
* If both are NULL, they can be considered equal.
*/
if (isnull1)
return true;
/*
* We do simple binary comparison of the two datums. This may be overly
* strict because there can be multiple binary representations for the
* same logical value. But we should be OK as long as there are no false
* positives. Using a type-specific equality operator is messy because
* there could be multiple notions of equality in different operator
* classes; furthermore, we cannot safely invoke user-defined functions
* while holding exclusive buffer lock.
*/
if (attrnum <= 0)
{
/* The only allowed system columns are OIDs, so do this */
return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
}
else
{
Assert(attrnum <= tupdesc->natts);
att = TupleDescAttr(tupdesc, attrnum - 1);
return datumIsEqual(value1, value2, att->attbyval, att->attlen);
}
}
/*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* Check which columns are being updated.
*
* Given an updated tuple, determine (and return into the output bitmapset),
* from those listed as interesting, the set of columns that changed.
*
* has_external indicates if any of the unmodified attributes (from those
* listed as interesting) of the old tuple is a member of external_cols and is
* stored externally.
*/
static Bitmapset *
HeapDetermineColumnsInfo(Relation relation,
Bitmapset *interesting_cols,
Bitmapset *external_cols,
HeapTuple oldtup, HeapTuple newtup,
bool *has_external)
{
int attidx;
Bitmapset *modified = NULL;
TupleDesc tupdesc = RelationGetDescr(relation);
attidx = -1;
while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
/* attidx is zero-based, attrnum is the normal attribute number */
AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
Datum value1,
value2;
bool isnull1,
isnull2;
/*
* If it's a whole-tuple reference, say "not equal". It's not really
* worth supporting this case, since it could only succeed after a
* no-op update, which is hardly a case worth optimizing for.
*/
if (attrnum == 0)
{
modified = bms_add_member(modified, attidx);
continue;
}
/*
* Likewise, automatically say "not equal" for any system attribute
* other than tableOID; we cannot expect these to be consistent in a
* HOT chain, or even to be set correctly yet in the new tuple.
*/
if (attrnum < 0)
{
if (attrnum != TableOidAttributeNumber)
{
modified = bms_add_member(modified, attidx);
continue;
}
}
/*
* Extract the corresponding values. XXX this is pretty inefficient
* if there are many indexed columns. Should we do a single
* heap_deform_tuple call on each tuple, instead? But that doesn't
* work for system columns ...
*/
value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
if (!heap_attr_equals(tupdesc, attrnum, value1,
value2, isnull1, isnull2))
{
modified = bms_add_member(modified, attidx);
continue;
}
/*
* No need to check attributes that can't be stored externally. Note
* that system attributes can't be stored externally.
*/
if (attrnum < 0 || isnull1 ||
TupleDescAttr(tupdesc, attrnum - 1)->attlen != -1)
continue;
/*
* Check if the old tuple's attribute is stored externally and is a
* member of external_cols.
*/
if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) &&
bms_is_member(attidx, external_cols))
*has_external = true;
}
return modified;
}
/*
* simple_heap_update - replace a tuple
*
* This routine may be used to update a tuple when concurrent updates of
* the target tuple are not expected (for example, because we have a lock
* on the relation associated with the tuple). Any failure is reported
* via ereport().
*/
void
simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
TU_UpdateIndexes *update_indexes)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result result;
TM_FailureData tmfd;
LockTupleMode lockmode;
result = heap_update(relation, otid, tup,
GetCurrentCommandId(true), InvalidSnapshot,
true /* wait for commit */ ,
&tmfd, &lockmode, update_indexes);
switch (result)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_SelfModified:
/* Tuple was already updated in current command? */
elog(ERROR, "tuple already updated by self");
break;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_Ok:
/* done successfully */
break;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_Updated:
elog(ERROR, "tuple concurrently updated");
break;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
case TM_Deleted:
elog(ERROR, "tuple concurrently deleted");
break;
default:
elog(ERROR, "unrecognized heap_update status: %u", result);
break;
}
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Return the MultiXactStatus corresponding to the given tuple lock mode.
*/
static MultiXactStatus
get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
{
int retval;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (is_update)
retval = tupleLockExtraInfo[mode].updstatus;
else
retval = tupleLockExtraInfo[mode].lockstatus;
if (retval == -1)
elog(ERROR, "invalid lock tuple mode %d/%s", mode,
is_update ? "true" : "false");
return (MultiXactStatus) retval;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/*
* heap_lock_tuple - lock a tuple in shared or exclusive mode
*
* Note that this acquires a buffer pin, which the caller must release.
*
* Input parameters:
* relation: relation containing tuple (caller must hold suitable lock)
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* tid: TID of tuple to lock
* cid: current command ID (used for visibility test, and stored into
* tuple's cmax if lock is successful)
* mode: indicates if shared or exclusive tuple lock is desired
* wait_policy: what to do if tuple lock is not available
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* follow_updates: if true, follow the update chain to also lock descendant
* tuples.
*
* Output parameters:
* *tuple: all fields filled in
* *buffer: set to buffer holding tuple (pinned but not locked at exit)
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* *tmfd: filled in failure cases (see below)
*
* Function results are the same as the ones for table_tuple_lock().
*
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* In the failure cases other than TM_Invisible, the routine fills
* *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
* if necessary), and t_cmax (the last only for TM_SelfModified,
* since we cannot obtain cmax from a combo CID generated by another
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
* transaction).
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* See comments for struct TM_FailureData for additional info.
*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* See README.tuplock for a thorough explanation of this mechanism.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result
heap_lock_tuple(Relation relation, HeapTuple tuple,
CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
bool follow_updates,
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Buffer *buffer, TM_FailureData *tmfd)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result result;
ItemPointer tid = &(tuple->t_self);
ItemId lp;
Page page;
Buffer vmbuffer = InvalidBuffer;
BlockNumber block;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
TransactionId xid,
xmax;
uint16 old_infomask,
new_infomask,
new_infomask2;
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
bool first_time = true;
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
bool skip_tuple_lock = false;
bool have_tuple_lock = false;
bool cleared_all_frozen = false;
*buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
block = ItemPointerGetBlockNumber(tid);
/*
* Before locking the buffer, pin the visibility map page if it appears to
* be necessary. Since we haven't got the lock yet, someone else might be
* in the middle of changing this, so we'll need to recheck after we have
* the lock.
*/
if (PageIsAllVisible(BufferGetPage(*buffer)))
visibilitymap_pin(relation, block, &vmbuffer);
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(*buffer);
lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
Assert(ItemIdIsNormal(lp));
tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
tuple->t_len = ItemIdGetLength(lp);
tuple->t_tableOid = RelationGetRelid(relation);
1999-05-25 18:15:34 +02:00
l3:
result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
1999-05-25 18:15:34 +02:00
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result == TM_Invisible)
{
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
/*
* This is possible, but only when locking a tuple for ON CONFLICT
* UPDATE. We return this value here rather than throwing an error in
* order to give that case the opportunity to throw a more specific
* error.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Invisible;
goto out_locked;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
else if (result == TM_BeingModified ||
result == TM_Updated ||
result == TM_Deleted)
{
TransactionId xwait;
uint16 infomask;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
uint16 infomask2;
bool require_sleep;
ItemPointerData t_ctid;
/* must copy state data before unlocking buffer */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
infomask = tuple->t_data->t_infomask;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
infomask2 = tuple->t_data->t_infomask2;
ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
/*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* If any subtransaction of the current top transaction already holds
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
* a lock as strong as or stronger than what we're requesting, we
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* effectively hold the desired lock already. We *must* succeed
* without trying to take the tuple lock, else we will deadlock
* against anyone wanting to acquire a stronger lock.
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
*
* Note we only do this the first time we loop on the HTSU result;
* there is no point in testing in subsequent passes, because
* evidently our own transaction cannot have acquired a new lock after
* the first time we checked.
*/
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
if (first_time)
{
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
first_time = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
if (infomask & HEAP_XMAX_IS_MULTI)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
int i;
int nmembers;
MultiXactMember *members;
/*
* We don't need to allow old multixacts here; if that had
* been the case, HeapTupleSatisfiesUpdate would have returned
* MayBeUpdated and we wouldn't be here.
*/
nmembers =
GetMultiXactIdMembers(xwait, &members, false,
HEAP_XMAX_IS_LOCKED_ONLY(infomask));
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
for (i = 0; i < nmembers; i++)
{
/* only consider members of our own transaction */
if (!TransactionIdIsCurrentTransactionId(members[i].xid))
continue;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
pfree(members);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_unlocked;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
else
{
/*
* Disable acquisition of the heavyweight tuple lock.
* Otherwise, when promoting a weaker lock, we might
* deadlock with another locker that has acquired the
* heavyweight tuple lock and is waiting for our
* transaction to finish.
*
* Note that in this case we still need to wait for
* the multixact if required, to avoid acquiring
* conflicting locks.
*/
skip_tuple_lock = true;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
if (members)
pfree(members);
}
else if (TransactionIdIsCurrentTransactionId(xwait))
{
switch (mode)
{
case LockTupleKeyShare:
Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
HEAP_XMAX_IS_EXCL_LOCKED(infomask));
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_unlocked;
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
case LockTupleShare:
if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
HEAP_XMAX_IS_EXCL_LOCKED(infomask))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_unlocked;
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
break;
case LockTupleNoKeyExclusive:
if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_unlocked;
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
break;
case LockTupleExclusive:
if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
infomask2 & HEAP_KEYS_UPDATED)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_unlocked;
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
break;
}
}
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Initially assume that we will have to wait for the locking
* transaction(s) to finish. We check various cases below in which
* this can be turned off.
*/
require_sleep = true;
if (mode == LockTupleKeyShare)
{
/*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* If we're requesting KeyShare, and there's no update present, we
* don't need to wait. Even if there is an update, we can still
* continue if the key hasn't been modified.
*
* However, if there are updates, we need to walk the update chain
* to mark future versions of the row as locked, too. That way,
* if somebody deletes that future version, we're protected
* against the key going away. This locking of future versions
* could block momentarily, if a concurrent transaction is
* deleting a key; or it could return a value to the effect that
* the transaction deleting the key has already committed. So we
* do this before re-locking the buffer; otherwise this would be
* prone to deadlocks.
*
* Note that the TID we're locking was grabbed before we unlocked
* the buffer. For it to change while we're not looking, the
* other properties we're testing for below after re-locking the
* buffer would also change, in which case we would restart this
* loop above.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (!(infomask2 & HEAP_KEYS_UPDATED))
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
bool updated;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* If there are updates, follow the update chain; bail out if
* that cannot be done.
*/
if (follow_updates && updated)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result res;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
GetCurrentTransactionId(),
mode);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (res != TM_Ok)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
result = res;
/* recovery code expects to have buffer lock held */
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
goto failed;
}
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
/*
* Make sure it's still an appropriate lock, else start over.
* Also, if it wasn't updated before we released the lock, but
* is updated now, we start over too; the reason is that we
* now need to follow the update chain to lock the new
* versions.
*/
if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
!updated))
goto l3;
/* Things look okay, so we can skip sleeping */
require_sleep = false;
/*
* Note we allow Xmax to change here; other updaters/lockers
* could have modified it before we grabbed the buffer lock.
* However, this is not a problem, because with the recheck we
* just did we ensure that they still don't conflict with the
* lock we want.
*/
}
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
else if (mode == LockTupleShare)
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* If we're requesting Share, we can similarly avoid sleeping if
* there's no update and no exclusive lock present.
*/
if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
!HEAP_XMAX_IS_EXCL_LOCKED(infomask))
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Make sure it's still an appropriate lock, else start over.
* See above about allowing xmax to change.
*/
if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
goto l3;
require_sleep = false;
}
}
else if (mode == LockTupleNoKeyExclusive)
{
/*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* If we're requesting NoKeyExclusive, we might also be able to
* avoid sleeping; just ensure that there no conflicting lock
* already acquired.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (infomask & HEAP_XMAX_IS_MULTI)
{
if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
mode, NULL))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
/*
* No conflict, but if the xmax changed under us in the
* meantime, start over.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
xwait))
goto l3;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/* otherwise, we're good */
require_sleep = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
}
else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
{
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
/* if the xmax changed in the meantime, start over */
Fix race when updating a tuple concurrently locked by another process If a tuple is locked, and this lock is later upgraded either to an update or to a stronger lock, and in the meantime some other process tries to lock, update or delete the same tuple, it (the tuple) could end up being updated twice, or having conflicting locks held. The reason for this is that the second updater checks for a change in Xmax value, or in the HEAP_XMAX_IS_MULTI infomask bit, after noticing the first lock; and if there's a change, it restarts and re-evaluates its ability to update the tuple. But it neglected to check for changes in lock strength or in lock-vs-update status when those two properties stayed the same. This would lead it to take the wrong decision and continue with its own update, when in reality it shouldn't do so but instead restart from the top. This could lead to either an assertion failure much later (when a multixact containing multiple updates is detected), or duplicate copies of tuples. To fix, make sure to compare the other relevant infomask bits alongside the Xmax value and HEAP_XMAX_IS_MULTI bit, and restart from the top if necessary. Also, in the belt-and-suspenders spirit, add a check to MultiXactCreateFromMembers that a multixact being created does not have two or more members that are claimed to be updates. This should protect against other bugs that might cause similar bogus situations. Backpatch to 9.3, where the possibility of multixacts containing updates was introduced. (In prior versions it was possible to have the tuple lock upgraded from shared to exclusive, and an update would not restart from the top; yet we're protected against a bug there because there's always a sleep to wait for the locking transaction to complete before continuing to do anything. Really, the fact that tuple locks always conflicted with concurrent updates is what protected against bugs here.) Per report from Andrew Dunstan and Josh Berkus in thread at http://www.postgresql.org/message-id/534C8B33.9050807@pgexperts.com Bug analysis by Andres Freund.
2014-04-24 20:41:55 +02:00
if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
xwait))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
goto l3;
/* otherwise, we're good */
require_sleep = false;
}
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/*
* As a check independent from those above, we can also avoid sleeping
* if the current transaction is the sole locker of the tuple. Note
* that the strength of the lock already held is irrelevant; this is
* not about recording the lock in Xmax (which will be done regardless
* of this optimization, below). Also, note that the cases where we
* hold a lock stronger than we are requesting are already handled
* above by not doing anything.
*
* Note we only deal with the non-multixact case here; MultiXactIdWait
* is well equipped to deal with this situation on its own.
*/
if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
TransactionIdIsCurrentTransactionId(xwait))
{
/* ... but if the xmax changed in the meantime, start over */
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
xwait))
goto l3;
Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
require_sleep = false;
}
/*
Avoid serializability errors when locking a tuple with a committed update When key-share locking a tuple that has been not-key-updated, and the update is a committed transaction, in some cases we raised serializability errors: ERROR: could not serialize access due to concurrent update Because the key-share doesn't conflict with the update, the error is unnecessary and inconsistent with the case that the update hasn't committed yet. This causes problems for some usage patterns, even if it can be claimed that it's sufficient to retry the aborted transaction: given a steady stream of updating transactions and a long locking transaction, the long transaction can be starved indefinitely despite multiple retries. To fix, we recognize that HeapTupleSatisfiesUpdate can return HeapTupleUpdated when an updating transaction has committed, and that we need to deal with that case exactly as if it were a non-committed update: verify whether the two operations conflict, and if not, carry on normally. If they do conflict, however, there is a difference: in the HeapTupleBeingUpdated case we can just sleep until the concurrent transaction is gone, while in the HeapTupleUpdated case this is not possible and we must raise an error instead. Per trouble report from Olivier Dony. In addition to a couple of test cases that verify the changed behavior, I added a test case to verify the behavior that remains unchanged, namely that errors are raised when a update that modifies the key is used. That must still generate serializability errors. One pre-existing test case changes behavior; per discussion, the new behavior is actually the desired one. Discussion: https://www.postgresql.org/message-id/560AA479.4080807@odoo.com https://www.postgresql.org/message-id/20151014164844.3019.25750@wrigleys.postgresql.org Backpatch to 9.3, where the problem appeared.
2016-07-15 20:17:20 +02:00
* Time to sleep on the other transaction/multixact, if necessary.
*
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* If the other transaction is an update/delete that's already
* committed, then sleeping cannot possibly do any good: if we're
* required to sleep, get out to raise an error instead.
Avoid serializability errors when locking a tuple with a committed update When key-share locking a tuple that has been not-key-updated, and the update is a committed transaction, in some cases we raised serializability errors: ERROR: could not serialize access due to concurrent update Because the key-share doesn't conflict with the update, the error is unnecessary and inconsistent with the case that the update hasn't committed yet. This causes problems for some usage patterns, even if it can be claimed that it's sufficient to retry the aborted transaction: given a steady stream of updating transactions and a long locking transaction, the long transaction can be starved indefinitely despite multiple retries. To fix, we recognize that HeapTupleSatisfiesUpdate can return HeapTupleUpdated when an updating transaction has committed, and that we need to deal with that case exactly as if it were a non-committed update: verify whether the two operations conflict, and if not, carry on normally. If they do conflict, however, there is a difference: in the HeapTupleBeingUpdated case we can just sleep until the concurrent transaction is gone, while in the HeapTupleUpdated case this is not possible and we must raise an error instead. Per trouble report from Olivier Dony. In addition to a couple of test cases that verify the changed behavior, I added a test case to verify the behavior that remains unchanged, namely that errors are raised when a update that modifies the key is used. That must still generate serializability errors. One pre-existing test case changes behavior; per discussion, the new behavior is actually the desired one. Discussion: https://www.postgresql.org/message-id/560AA479.4080807@odoo.com https://www.postgresql.org/message-id/20151014164844.3019.25750@wrigleys.postgresql.org Backpatch to 9.3, where the problem appeared.
2016-07-15 20:17:20 +02:00
*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* By here, we either have already acquired the buffer exclusive lock,
* or we must wait for the locking transaction or multixact; so below
* we ensure that we grab buffer lock after the sleep.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (require_sleep && (result == TM_Updated || result == TM_Deleted))
Avoid serializability errors when locking a tuple with a committed update When key-share locking a tuple that has been not-key-updated, and the update is a committed transaction, in some cases we raised serializability errors: ERROR: could not serialize access due to concurrent update Because the key-share doesn't conflict with the update, the error is unnecessary and inconsistent with the case that the update hasn't committed yet. This causes problems for some usage patterns, even if it can be claimed that it's sufficient to retry the aborted transaction: given a steady stream of updating transactions and a long locking transaction, the long transaction can be starved indefinitely despite multiple retries. To fix, we recognize that HeapTupleSatisfiesUpdate can return HeapTupleUpdated when an updating transaction has committed, and that we need to deal with that case exactly as if it were a non-committed update: verify whether the two operations conflict, and if not, carry on normally. If they do conflict, however, there is a difference: in the HeapTupleBeingUpdated case we can just sleep until the concurrent transaction is gone, while in the HeapTupleUpdated case this is not possible and we must raise an error instead. Per trouble report from Olivier Dony. In addition to a couple of test cases that verify the changed behavior, I added a test case to verify the behavior that remains unchanged, namely that errors are raised when a update that modifies the key is used. That must still generate serializability errors. One pre-existing test case changes behavior; per discussion, the new behavior is actually the desired one. Discussion: https://www.postgresql.org/message-id/560AA479.4080807@odoo.com https://www.postgresql.org/message-id/20151014164844.3019.25750@wrigleys.postgresql.org Backpatch to 9.3, where the problem appeared.
2016-07-15 20:17:20 +02:00
{
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
goto failed;
}
else if (require_sleep)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
/*
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
* Acquire tuple lock to establish our priority for the tuple, or
* die trying. LockTuple will release us when we are next-in-line
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
* for the tuple. We must do this even if we are share-locking,
* but not if we already have a weaker lock on the tuple.
*
* If we are forced to "start over" below, we keep the tuple lock;
* this arranges that we stay at the head of the line while
* rechecking tuple state.
*/
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
if (!skip_tuple_lock &&
!heap_acquire_tuplock(relation, tid, mode, wait_policy,
&have_tuple_lock))
{
/*
* This can only happen if wait_policy is Skip and the lock
* couldn't be obtained.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_WouldBlock;
/* recovery code expects to have buffer lock held */
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
goto failed;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (infomask & HEAP_XMAX_IS_MULTI)
{
MultiXactStatus status = get_mxact_status_for_lock(mode, false);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/* We only ever lock tuples, never update them */
if (status >= MultiXactStatusNoKeyUpdate)
elog(ERROR, "invalid lock mode in heap_lock_tuple");
/* wait for multixact to end, or die trying */
switch (wait_policy)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
case LockWaitBlock:
MultiXactIdWait((MultiXactId) xwait, status, infomask,
relation, &tuple->t_self, XLTW_Lock, NULL);
break;
case LockWaitSkip:
if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
status, infomask, relation,
NULL))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_WouldBlock;
/* recovery code expects to have buffer lock held */
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
goto failed;
}
break;
case LockWaitError:
if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
status, infomask, relation,
NULL))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
break;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* Of course, the multixact might not be done here: if we're
* requesting a light lock mode, other transactions with light
* locks could still be alive, as well as locks owned by our
* own xact or other subxacts of this backend. We need to
* preserve the surviving MultiXact members. Note that it
* isn't absolutely necessary in the latter case, but doing so
* is simpler.
*/
}
else
{
/* wait for regular transaction to end, or die trying */
switch (wait_policy)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
case LockWaitBlock:
XactLockTableWait(xwait, relation, &tuple->t_self,
XLTW_Lock);
break;
case LockWaitSkip:
if (!ConditionalXactLockTableWait(xwait))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_WouldBlock;
/* recovery code expects to have buffer lock held */
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
goto failed;
}
break;
case LockWaitError:
if (!ConditionalXactLockTableWait(xwait))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
break;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/* if there are updates, follow the update chain */
if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result res;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
GetCurrentTransactionId(),
mode);
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (res != TM_Ok)
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
{
result = res;
/* recovery code expects to have buffer lock held */
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
goto failed;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
/*
* xwait is done, but if xwait had just locked the tuple then some
* other xact could update this tuple before we get to this point.
* Check for xmax change, and start over if so.
*/
if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
xwait))
goto l3;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
if (!(infomask & HEAP_XMAX_IS_MULTI))
{
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Otherwise check if it committed or aborted. Note we cannot
* be here if the tuple was only locked by somebody who didn't
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
* conflict with us; that would have been handled above. So
* that transaction must necessarily be gone by now. But
* don't check for this in the multixact case, because some
* locker transactions might still be running.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
}
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/* By here, we're certain that we hold buffer exclusive lock again */
/*
* We may lock if previous xmax aborted, or if it committed but only
* locked the tuple without updating it; or if we didn't have to wait
* at all for whatever reason.
*/
if (!require_sleep ||
(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
HeapTupleHeaderIsOnlyLocked(tuple->t_data))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Updated;
else
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Deleted;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
failed:
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result != TM_Ok)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(result == TM_SelfModified || result == TM_Updated ||
result == TM_Deleted || result == TM_WouldBlock);
/*
* When locking a tuple under LockWaitSkip semantics and we fail with
* TM_WouldBlock above, it's possible for concurrent transactions to
* release the lock and set HEAP_XMAX_INVALID in the meantime. So
* this assert is slightly different from the equivalent one in
* heap_delete and heap_update.
*/
2022-01-04 17:21:23 +01:00
Assert((result == TM_WouldBlock) ||
!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(result != TM_Updated ||
!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
tmfd->ctid = tuple->t_data->t_ctid;
tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
if (result == TM_SelfModified)
tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
else
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
tmfd->cmax = InvalidCommandId;
goto out_locked;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/*
* If we didn't pin the visibility map page and the page has become all
* visible while we were busy locking the buffer, or during some
* subsequent window during which we had it unlocked, we'll have to unlock
* and re-lock, to avoid holding the buffer lock across I/O. That's a bit
* unfortunate, especially since we'll now have to recheck whether the
* tuple has been locked or updated under us, but hopefully it won't
* happen very often.
*/
if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
{
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
visibilitymap_pin(relation, block, &vmbuffer);
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
goto l3;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
old_infomask = tuple->t_data->t_infomask;
/*
* If this is the first possibly-multixact-able operation in the current
* transaction, set my per-backend OldestMemberMXactId setting. We can be
* certain that the transaction will never become a member of any older
* MultiXactIds than that. (We have to do this even if we end up just
* using our own TransactionId below, since some other backend could
* incorporate our XID into a MultiXact immediately afterwards.)
*/
MultiXactIdSetOldestMember();
/*
* Compute the new xmax and infomask to store into the tuple. Note we do
* not modify the tuple just yet, because that would leave it in the wrong
* state if multixact.c elogs.
*/
compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
GetCurrentTransactionId(), mode, false,
&xid, &new_infomask, &new_infomask2);
START_CRIT_SECTION();
2000-07-03 04:54:21 +02:00
/*
* Store transaction information of xact locking the tuple.
*
* Note: Cmax is meaningless in this context, so don't set it; this avoids
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* possibly generating a useless combo CID. Moreover, if we're locking a
* previously updated tuple, it's important to preserve the Cmax.
*
* Also reset the HOT UPDATE bit, but only if there's no update; otherwise
* we would break the HOT chain.
*/
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
tuple->t_data->t_infomask |= new_infomask;
tuple->t_data->t_infomask2 |= new_infomask2;
if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
HeapTupleHeaderClearHotUpdated(tuple->t_data);
HeapTupleHeaderSetXmax(tuple->t_data, xid);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Make sure there is no forward chain link in t_ctid. Note that in the
* cases where the tuple has been updated, we must not overwrite t_ctid,
* because it was set by the updater. Moreover, if the tuple has been
* updated, we need to follow the update chain to lock the new versions of
* the tuple as well.
*/
if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
tuple->t_data->t_ctid = *tid;
/* Clear only the all-frozen bit on visibility map if needed */
if (PageIsAllVisible(page) &&
visibilitymap_clear(relation, block, vmbuffer,
VISIBILITYMAP_ALL_FROZEN))
cleared_all_frozen = true;
MarkBufferDirty(*buffer);
/*
* XLOG stuff. You might think that we don't need an XLOG record because
* there is no state change worth restoring after a crash. You would be
* wrong however: we have just written either a TransactionId or a
* MultiXactId that may never have been seen on disk before, and we need
* to make sure that there are XLOG entries covering those ID numbers.
* Else the same IDs might be re-used after a crash, which would be
* disastrous if this page made it to disk before the crash. Essentially
* we have to enforce the WAL log-before-data rule even in this case.
* (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
* entries for everything anyway.)
*/
if (RelationNeedsWAL(relation))
{
xl_heap_lock xlrec;
XLogRecPtr recptr;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
xlrec.xmax = xid;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xlrec.infobits_set = compute_infobits(new_infomask,
tuple->t_data->t_infomask2);
xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/* we don't decode row locks atm, so no need to log the origin */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
out_locked:
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
out_unlocked:
if (BufferIsValid(vmbuffer))
ReleaseBuffer(vmbuffer);
/*
* Don't update the visibility map here. Locking a tuple doesn't change
* visibility info.
*/
/*
* Now that we have successfully marked the tuple as locked, we can
* release the lmgr tuple lock, if we had it.
*/
if (have_tuple_lock)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
UnlockTupleTuplock(relation, tid, mode);
return result;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/*
* Acquire heavyweight lock on the given tuple, in preparation for acquiring
* its normal, Xmax-based tuple lock.
*
* have_tuple_lock is an input and output parameter: on input, it indicates
* whether the lock has previously been acquired (and this function does
* nothing in that case). If this function returns success, have_tuple_lock
* has been flipped to true.
*
* Returns false if it was unable to obtain the lock; this can only happen if
* wait_policy is Skip.
*/
static bool
heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
LockWaitPolicy wait_policy, bool *have_tuple_lock)
{
if (*have_tuple_lock)
return true;
switch (wait_policy)
{
case LockWaitBlock:
LockTupleTuplock(relation, tid, mode);
break;
case LockWaitSkip:
if (!ConditionalLockTupleTuplock(relation, tid, mode))
return false;
break;
case LockWaitError:
if (!ConditionalLockTupleTuplock(relation, tid, mode))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
break;
}
*have_tuple_lock = true;
return true;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Given an original set of Xmax and infomask, and a transaction (identified by
* add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
* corresponding infomasks to use on the tuple.
*
* Note that this might have side effects such as creating a new MultiXactId.
*
* Most callers will have called HeapTupleSatisfiesUpdate before this function;
* that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
* but it was not running anymore. There is a race condition, which is that the
* MultiXactId may have finished since then, but that uncommon case is handled
* either here, or within MultiXactIdExpand.
*
* There is a similar race condition possible when the old xmax was a regular
* TransactionId. We test TransactionIdIsInProgress again just to narrow the
* window, but it's still possible to end up creating an unnecessary
* MultiXactId. Fortunately this is harmless.
*/
static void
compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
uint16 old_infomask2, TransactionId add_to_xmax,
LockTupleMode mode, bool is_update,
TransactionId *result_xmax, uint16 *result_infomask,
uint16 *result_infomask2)
{
TransactionId new_xmax;
uint16 new_infomask,
new_infomask2;
Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
l5:
new_infomask = 0;
new_infomask2 = 0;
if (old_infomask & HEAP_XMAX_INVALID)
{
/*
* No previous locker; we just insert our own TransactionId.
*
* Note that it's critical that this case be the first one checked,
* because there are several blocks below that come back to this one
* to implement certain optimizations; old_infomask might contain
* other dirty bits in those cases, but we don't really care.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
if (is_update)
{
new_xmax = add_to_xmax;
if (mode == LockTupleExclusive)
new_infomask2 |= HEAP_KEYS_UPDATED;
}
else
{
new_infomask |= HEAP_XMAX_LOCK_ONLY;
switch (mode)
{
case LockTupleKeyShare:
new_xmax = add_to_xmax;
new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
break;
case LockTupleShare:
new_xmax = add_to_xmax;
new_infomask |= HEAP_XMAX_SHR_LOCK;
break;
case LockTupleNoKeyExclusive:
new_xmax = add_to_xmax;
new_infomask |= HEAP_XMAX_EXCL_LOCK;
break;
case LockTupleExclusive:
new_xmax = add_to_xmax;
new_infomask |= HEAP_XMAX_EXCL_LOCK;
new_infomask2 |= HEAP_KEYS_UPDATED;
break;
default:
new_xmax = InvalidTransactionId; /* silence compiler */
elog(ERROR, "invalid lock mode");
}
}
}
else if (old_infomask & HEAP_XMAX_IS_MULTI)
{
MultiXactStatus new_status;
/*
* Currently we don't allow XMAX_COMMITTED to be set for multis, so
* cross-check.
*/
Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
/*
* A multixact together with LOCK_ONLY set but neither lock bit set
* (i.e. a pg_upgraded share locked tuple) cannot possibly be running
* anymore. This check is critical for databases upgraded by
* pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
* that such multis are never passed.
*/
Fix handling of multixacts predating pg_upgrade After pg_upgrade, it is possible that some tuples' Xmax have multixacts corresponding to the old installation; such multixacts cannot have running members anymore. In many code sites we already know not to read them and clobber them silently, but at least when VACUUM tries to freeze a multixact or determine whether one needs freezing, there's an attempt to resolve it to its member transactions by calling GetMultiXactIdMembers, and if the multixact value is "in the future" with regards to the current valid multixact range, an error like this is raised: ERROR: MultiXactId 123 has not been created yet -- apparent wraparound and vacuuming fails. Per discussion with Andrew Gierth, it is completely bogus to try to resolve multixacts coming from before a pg_upgrade, regardless of where they stand with regards to the current valid multixact range. It's possible to get from under this problem by doing SELECT FOR UPDATE of the problem tuples, but if tables are large, this is slow and tedious, so a more thorough solution is desirable. To fix, we realize that multixacts in xmax created in 9.2 and previous have a specific bit pattern that is never used in 9.3 and later (we already knew this, per comments and infomask tests sprinkled in various places, but we weren't leveraging this knowledge appropriately). Whenever the infomask of the tuple matches that bit pattern, we just ignore the multixact completely as if Xmax wasn't set; or, in the case of tuple freezing, we act as if an unwanted value is set and clobber it without decoding. This guarantees that no errors will be raised, and that the values will be progressively removed until all tables are clean. Most callers of GetMultiXactIdMembers are patched to recognize directly that the value is a removable "empty" multixact and avoid calling GetMultiXactIdMembers altogether. To avoid changing the signature of GetMultiXactIdMembers() in back branches, we keep the "allow_old" boolean flag but rename it to "from_pgupgrade"; if the flag is true, we always return an empty set instead of looking up the multixact. (I suppose we could remove the argument in the master branch, but I chose not to do so in this commit). This was broken all along, but the error-facing message appeared first because of commit 8e9a16ab8f7f and was partially fixed in a25c2b7c4db3. This fix, backpatched all the way back to 9.3, goes approximately in the same direction as a25c2b7c4db3 but should cover all cases. Bug analysis by Andrew Gierth and Álvaro Herrera. A number of public reports match this bug: https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-25 00:29:28 +02:00
if (HEAP_LOCKED_UPGRADED(old_infomask))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
old_infomask &= ~HEAP_XMAX_IS_MULTI;
old_infomask |= HEAP_XMAX_INVALID;
goto l5;
}
/*
* If the XMAX is already a MultiXactId, then we need to expand it to
* include add_to_xmax; but if all the members were lockers and are
* all gone, we can do away with the IS_MULTI bit and just set
* add_to_xmax as the only locker/updater. If all lockers are gone
* and we have an updater that aborted, we can also do without a
* multi.
*
* The cost of doing GetMultiXactIdMembers would be paid by
* MultiXactIdExpand if we weren't to do this, so this check is not
* incurring extra work anyhow.
*/
if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
Optimize locking a tuple already locked by another subxact Locking and updating the same tuple repeatedly led to some strange multixacts being created which had several subtransactions of the same parent transaction holding locks of the same strength. However, once a subxact of the current transaction holds a lock of a given strength, it's not necessary to acquire the same lock again. This made some coding patterns much slower than required. The fix is twofold. First we change HeapTupleSatisfiesUpdate to return HeapTupleBeingUpdated for the case where the current transaction is already a single-xid locker for the given tuple; it used to return HeapTupleMayBeUpdated for that case. The new logic is simpler, and the change to pgrowlocks is a testament to that: previously we needed to check for the single-xid locker separately in a very ugly way. That test is simpler now. As fallout from the HTSU change, some of its callers need to be amended so that tuple-locked-by-own-transaction is taken into account in the BeingUpdated case rather than the MayBeUpdated case. For many of them there is no difference; but heap_delete() and heap_update now check explicitely and do not grab tuple lock in that case. The HTSU change also means that routine MultiXactHasRunningRemoteMembers introduced in commit 11ac4c73cb895 is no longer necessary and can be removed; the case that used to require it is now handled naturally as result of the changes to heap_delete and heap_update. The second part of the fix to the performance issue is to adjust heap_lock_tuple to avoid the slowness: 1. Previously we checked for the case that our own transaction already held a strong enough lock and returned MayBeUpdated, but only in the multixact case. Now we do it for the plain Xid case as well, which saves having to LockTuple. 2. If the current transaction is the only locker of the tuple (but with a lock not as strong as what we need; otherwise it would have been caught in the check mentioned above), we can skip sleeping on the multixact, and instead go straight to create an updated multixact with the additional lock strength. 3. Most importantly, make sure that both the single-xid-locker case and the multixact-locker case optimization are applied always. We do this by checking both in a single place, rather than them appearing in two separate portions of the routine -- something that is made possible by the HeapTupleSatisfiesUpdate API change. Previously we would only check for the single-xid case when HTSU returned MayBeUpdated, and only checked for the multixact case when HTSU returned BeingUpdated. This was at odds with what HTSU actually returned in one case: if our own transaction was locker in a multixact, it returned MayBeUpdated, so the optimization never applied. This is what led to the large multixacts in the first place. Per bug report #8470 by Oskari Saarenmaa.
2015-04-10 18:47:15 +02:00
!TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
old_infomask)))
{
/*
* Reset these bits and restart; otherwise fall through to
* create a new multi below.
*/
old_infomask &= ~HEAP_XMAX_IS_MULTI;
old_infomask |= HEAP_XMAX_INVALID;
goto l5;
}
}
new_status = get_mxact_status_for_lock(mode, is_update);
new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
new_status);
GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
}
else if (old_infomask & HEAP_XMAX_COMMITTED)
{
/*
* It's a committed update, so we need to preserve him as updater of
* the tuple.
*/
MultiXactStatus status;
MultiXactStatus new_status;
if (old_infomask2 & HEAP_KEYS_UPDATED)
status = MultiXactStatusUpdate;
else
status = MultiXactStatusNoKeyUpdate;
new_status = get_mxact_status_for_lock(mode, is_update);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* since it's not running, it's obviously impossible for the old
* updater to be identical to the current one, so we need not check
* for that case as we do in the block above.
*/
new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
}
else if (TransactionIdIsInProgress(xmax))
{
/*
* If the XMAX is a valid, in-progress TransactionId, then we need to
* create a new MultiXactId that includes both the old locker or
* updater and our own TransactionId.
*/
MultiXactStatus new_status;
MultiXactStatus old_status;
LockTupleMode old_mode;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
{
if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
old_status = MultiXactStatusForKeyShare;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
old_status = MultiXactStatusForShare;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
{
if (old_infomask2 & HEAP_KEYS_UPDATED)
old_status = MultiXactStatusForUpdate;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
else
old_status = MultiXactStatusForNoKeyUpdate;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
else
{
/*
* LOCK_ONLY can be present alone only when a page has been
* upgraded by pg_upgrade. But in that case,
* TransactionIdIsInProgress() should have returned false. We
* assume it's no longer locked in this case.
*/
elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
old_infomask |= HEAP_XMAX_INVALID;
old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
goto l5;
}
}
else
{
/* it's an update, but which kind? */
if (old_infomask2 & HEAP_KEYS_UPDATED)
old_status = MultiXactStatusUpdate;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
else
old_status = MultiXactStatusNoKeyUpdate;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
old_mode = TUPLOCK_from_mxstatus(old_status);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* If the lock to be acquired is for the same TransactionId as the
* existing lock, there's an optimization possible: consider only the
* strongest of both locks as the only one present, and restart.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
if (xmax == add_to_xmax)
{
/*
* Note that it's not possible for the original tuple to be
* updated: we wouldn't be here because the tuple would have been
* invisible and we wouldn't try to update it. As a subtlety,
* this code can also run when traversing an update chain to lock
* future versions of a tuple. But we wouldn't be here either,
* because the add_to_xmax would be different from the original
* updater.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
/* acquire the strongest of both */
if (mode < old_mode)
mode = old_mode;
/* mustn't touch is_update */
old_infomask |= HEAP_XMAX_INVALID;
goto l5;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/* otherwise, just fall back to creating a new multixact */
new_status = get_mxact_status_for_lock(mode, is_update);
new_xmax = MultiXactIdCreate(xmax, old_status,
add_to_xmax, new_status);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
}
else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
TransactionIdDidCommit(xmax))
{
/*
* It's a committed update, so we gotta preserve him as updater of the
* tuple.
*/
MultiXactStatus status;
MultiXactStatus new_status;
if (old_infomask2 & HEAP_KEYS_UPDATED)
status = MultiXactStatusUpdate;
else
status = MultiXactStatusNoKeyUpdate;
new_status = get_mxact_status_for_lock(mode, is_update);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* since it's not running, it's obviously impossible for the old
* updater to be identical to the current one, so we need not check
* for that case as we do in the block above.
*/
new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
}
else
{
/*
* Can get here iff the locking/updating transaction was running when
* the infomask was extracted from the tuple, but finished before
* TransactionIdIsInProgress got to run. Deal with it as if there was
* no locker at all in the first place.
*/
old_infomask |= HEAP_XMAX_INVALID;
goto l5;
}
*result_infomask = new_infomask;
*result_infomask2 = new_infomask2;
*result_xmax = new_xmax;
}
/*
* Subroutine for heap_lock_updated_tuple_rec.
*
* Given a hypothetical multixact status held by the transaction identified
* with the given xid, does the current transaction need to wait, fail, or can
* it continue if it wanted to acquire a lock of the given mode? "needwait"
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
* is set to true if waiting is necessary; if it can continue, then TM_Ok is
* returned. If the lock is already held by the current transaction, return
* TM_SelfModified. In case of a conflict with another transaction, a
* different HeapTupleSatisfiesUpdate return code is returned.
*
* The held status is said to be hypothetical because it might correspond to a
* lock held by a single Xid, i.e. not a real MultiXactId; we express it this
* way for simplicity of API.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
static TM_Result
test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
LockTupleMode mode, HeapTuple tup,
bool *needwait)
{
MultiXactStatus wantedstatus;
*needwait = false;
wantedstatus = get_mxact_status_for_lock(mode, false);
/*
* Note: we *must* check TransactionIdIsInProgress before
* TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
* for an explanation.
*/
if (TransactionIdIsCurrentTransactionId(xid))
{
/*
* The tuple has already been locked by our own transaction. This is
* very rare but can happen if multiple transactions are trying to
* lock an ancient version of the same tuple.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_SelfModified;
}
else if (TransactionIdIsInProgress(xid))
{
/*
* If the locking transaction is running, what we do depends on
* whether the lock modes conflict: if they do, then we must wait for
* it to finish; otherwise we can fall through to lock this tuple
* version without waiting.
*/
if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
LOCKMODE_from_mxstatus(wantedstatus)))
{
*needwait = true;
}
/*
* If we set needwait above, then this value doesn't matter;
* otherwise, this value signals to caller that it's okay to proceed.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
}
else if (TransactionIdDidAbort(xid))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
else if (TransactionIdDidCommit(xid))
{
/*
Fix improper abort during update chain locking In 247c76a98909, I added some code to do fine-grained checking of MultiXact status of locking/updating transactions when traversing an update chain. There was a thinko in that patch which would have the traversing abort, that is return HeapTupleUpdated, when the other transaction is a committed lock-only. In this case we should ignore it and return success instead. Of course, in the case where there is a committed update, HeapTupleUpdated is the correct return value. A user-visible symptom of this bug is that in REPEATABLE READ and SERIALIZABLE transaction isolation modes spurious serializability errors can occur: ERROR: could not serialize access due to concurrent update In order for this to happen, there needs to be a tuple that's key-share- locked and also updated, and the update must abort; a subsequent transaction trying to acquire a new lock on that tuple would abort with the above error. The reason is that the initial FOR KEY SHARE is seen as committed by the new locking transaction, which triggers this bug. (If the UPDATE commits, then the serialization error is correctly reported.) When running a query in READ COMMITTED mode, what happens is that the locking is aborted by the HeapTupleUpdated return value, then EvalPlanQual fetches the newest version of the tuple, which is then the only version that gets locked. (The second time the tuple is checked there is no misbehavior on the committed lock-only, because it's not checked by the code that traverses update chains; so no bug.) Only the newest version of the tuple is locked, not older ones, but this is harmless. The isolation test added by this commit illustrates the desired behavior, including the proper serialization errors that get thrown. Backpatch to 9.3.
2013-12-05 21:47:51 +01:00
* The other transaction committed. If it was only a locker, then the
* lock is completely gone now and we can return success; but if it
* was an update, then what we do depends on whether the two lock
* modes conflict. If they conflict, then we must report error to
* caller. But if they don't, we can fall through to allow the current
* transaction to lock the tuple.
*
* Note: the reason we worry about ISUPDATE here is because as soon as
* a transaction ends, all its locks are gone and meaningless, and
* thus we can ignore them; whereas its updates persist. In the
* TransactionIdIsInProgress case, above, we don't need to check
* because we know the lock is still "alive" and thus a conflict needs
* always be checked.
*/
Fix improper abort during update chain locking In 247c76a98909, I added some code to do fine-grained checking of MultiXact status of locking/updating transactions when traversing an update chain. There was a thinko in that patch which would have the traversing abort, that is return HeapTupleUpdated, when the other transaction is a committed lock-only. In this case we should ignore it and return success instead. Of course, in the case where there is a committed update, HeapTupleUpdated is the correct return value. A user-visible symptom of this bug is that in REPEATABLE READ and SERIALIZABLE transaction isolation modes spurious serializability errors can occur: ERROR: could not serialize access due to concurrent update In order for this to happen, there needs to be a tuple that's key-share- locked and also updated, and the update must abort; a subsequent transaction trying to acquire a new lock on that tuple would abort with the above error. The reason is that the initial FOR KEY SHARE is seen as committed by the new locking transaction, which triggers this bug. (If the UPDATE commits, then the serialization error is correctly reported.) When running a query in READ COMMITTED mode, what happens is that the locking is aborted by the HeapTupleUpdated return value, then EvalPlanQual fetches the newest version of the tuple, which is then the only version that gets locked. (The second time the tuple is checked there is no misbehavior on the committed lock-only, because it's not checked by the code that traverses update chains; so no bug.) Only the newest version of the tuple is locked, not older ones, but this is harmless. The isolation test added by this commit illustrates the desired behavior, including the proper serialization errors that get thrown. Backpatch to 9.3.
2013-12-05 21:47:51 +01:00
if (!ISUPDATE_from_mxstatus(status))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
Fix improper abort during update chain locking In 247c76a98909, I added some code to do fine-grained checking of MultiXact status of locking/updating transactions when traversing an update chain. There was a thinko in that patch which would have the traversing abort, that is return HeapTupleUpdated, when the other transaction is a committed lock-only. In this case we should ignore it and return success instead. Of course, in the case where there is a committed update, HeapTupleUpdated is the correct return value. A user-visible symptom of this bug is that in REPEATABLE READ and SERIALIZABLE transaction isolation modes spurious serializability errors can occur: ERROR: could not serialize access due to concurrent update In order for this to happen, there needs to be a tuple that's key-share- locked and also updated, and the update must abort; a subsequent transaction trying to acquire a new lock on that tuple would abort with the above error. The reason is that the initial FOR KEY SHARE is seen as committed by the new locking transaction, which triggers this bug. (If the UPDATE commits, then the serialization error is correctly reported.) When running a query in READ COMMITTED mode, what happens is that the locking is aborted by the HeapTupleUpdated return value, then EvalPlanQual fetches the newest version of the tuple, which is then the only version that gets locked. (The second time the tuple is checked there is no misbehavior on the committed lock-only, because it's not checked by the code that traverses update chains; so no bug.) Only the newest version of the tuple is locked, not older ones, but this is harmless. The isolation test added by this commit illustrates the desired behavior, including the proper serialization errors that get thrown. Backpatch to 9.3.
2013-12-05 21:47:51 +01:00
if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
LOCKMODE_from_mxstatus(wantedstatus)))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
{
/* bummer */
if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Updated;
else
return TM_Deleted;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
}
/* Not in progress, not aborted, not committed -- must have crashed */
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Recursive part of heap_lock_updated_tuple
*
* Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
* xid with the given mode; if this tuple is updated, recurse to lock the new
* version as well.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
static TM_Result
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
LockTupleMode mode)
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
TM_Result result;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ItemPointerData tupid;
HeapTupleData mytup;
Buffer buf;
uint16 new_infomask,
new_infomask2,
old_infomask,
old_infomask2;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
TransactionId xmax,
new_xmax;
TransactionId priorXmax = InvalidTransactionId;
bool cleared_all_frozen = false;
bool pinned_desired_page;
Buffer vmbuffer = InvalidBuffer;
BlockNumber block;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ItemPointerCopy(tid, &tupid);
for (;;)
{
new_infomask = 0;
new_xmax = InvalidTransactionId;
block = ItemPointerGetBlockNumber(&tupid);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ItemPointerCopy(&tupid, &(mytup.t_self));
Prevent access to no-longer-pinned buffer in heapam_tuple_lock(). heap_fetch() used to have a "keep_buf" parameter that told it to return ownership of the buffer pin to the caller after finding that the requested tuple TID exists but is invisible to the specified snapshot. This was thoughtlessly removed in commit 5db6df0c0, which broke heapam_tuple_lock() (formerly EvalPlanQualFetch) because that function needs to do more accesses to the tuple even if it's invisible. The net effect is that we would continue to touch the page for a microsecond or two after releasing pin on the buffer. Usually no harm would result; but if a different session decided to defragment the page concurrently, we could see garbage data and mistakenly conclude that there's no newer tuple version to chain up to. (It's hard to say whether this has happened in the field. The bug was actually found thanks to a later change that allowed valgrind to detect accesses to non-pinned buffers.) The most reasonable way to fix this is to reintroduce keep_buf, although I made it behave slightly differently: buffer ownership is passed back only if there is a valid tuple at the requested TID. In HEAD, we can just add the parameter back to heap_fetch(). To avoid an API break in the back branches, introduce an additional function heap_fetch_extended() in those branches. In HEAD there is an additional, less obvious API change: tuple->t_data will be set to NULL in all cases where buffer ownership is not returned, in particular when the tuple exists but fails the time qual (and !keep_buf). This is to defend against any other callers attempting to access non-pinned buffers. We concluded that making that change in back branches would be more likely to introduce problems than cure any. In passing, remove a comment about heap_fetch that was obsoleted by 9a8ee1dc6. Per bug #17462 from Daniil Anisimov. Back-patch to v12 where the bug was introduced. Discussion: https://postgr.es/m/17462-9c98a0f00df9bd36@postgresql.org
2022-04-13 19:35:02 +02:00
if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
{
/*
* if we fail to find the updated version of the tuple, it's
* because it was vacuumed/pruned away after its creator
* transaction aborted. So behave as if we got to the end of the
* chain, and there's no further tuple to lock: return success to
* caller.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_unlocked;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
l4:
CHECK_FOR_INTERRUPTS();
/*
* Before locking the buffer, pin the visibility map page if it
* appears to be necessary. Since we haven't got the lock yet,
* someone else might be in the middle of changing this, so we'll need
* to recheck after we have the lock.
*/
if (PageIsAllVisible(BufferGetPage(buf)))
{
visibilitymap_pin(rel, block, &vmbuffer);
pinned_desired_page = true;
}
else
pinned_desired_page = false;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
/*
* If we didn't pin the visibility map page and the page has become
* all visible while we were busy locking the buffer, we'll have to
* unlock and re-lock, to avoid holding the buffer lock across I/O.
* That's a bit unfortunate, but hopefully shouldn't happen often.
*
* Note: in some paths through this function, we will reach here
* holding a pin on a vm page that may or may not be the one matching
* this page. If this page isn't all-visible, we won't use the vm
* page, but we hold onto such a pin till the end of the function.
*/
if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
visibilitymap_pin(rel, block, &vmbuffer);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
}
/*
* Check the tuple XMIN against prior XMAX, if any. If we reached the
* end of the chain, we're done, so return success.
*/
if (TransactionIdIsValid(priorXmax) &&
!TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
priorXmax))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_locked;
}
Fix locking a tuple updated by an aborted (sub)transaction When heap_lock_tuple decides to follow the update chain, it tried to also lock any version of the tuple that was created by an update that was subsequently rolled back. This is pointless, since for all intents and purposes that tuple exists no more; and moreover it causes misbehavior, as reported independently by Marko Tiikkaja and Marti Raudsepp: some SELECT FOR UPDATE/SHARE queries may fail to return the tuples, and assertion-enabled builds crash. Fix by having heap_lock_updated_tuple test the xmin and return success immediately if the tuple was created by an aborted transaction. The condition where tuples become invisible occurs when an updated tuple chain is followed by heap_lock_updated_tuple, which reports the problem as HeapTupleSelfUpdated to its caller heap_lock_tuple, which in turn propagates that code outwards possibly leading the calling code (ExecLockRows) to believe that the tuple exists no longer. Backpatch to 9.3. Only on 9.5 and newer this leads to a visible failure, because of commit 27846f02c176; before that, heap_lock_tuple skips the whole dance when the tuple is already locked by the same transaction, because of the ancient HeapTupleSatisfiesUpdate behavior. Still, the buggy condition may also exist in more convoluted scenarios involving concurrent transactions, so it seems safer to fix the bug in the old branches too. Discussion: https://www.postgresql.org/message-id/CABRT9RC81YUf1=jsmWopcKJEro=VoeG2ou6sPwyOUTx_qteRsg@mail.gmail.com https://www.postgresql.org/message-id/48d3eade-98d3-8b9a-477e-1a8dc32a724d@joh.to
2016-09-09 20:54:29 +02:00
/*
* Also check Xmin: if this tuple was created by an aborted
* (sub)transaction, then we already locked the last live one in the
* chain, thus we're done, so return success.
*/
if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_locked;
Fix locking a tuple updated by an aborted (sub)transaction When heap_lock_tuple decides to follow the update chain, it tried to also lock any version of the tuple that was created by an update that was subsequently rolled back. This is pointless, since for all intents and purposes that tuple exists no more; and moreover it causes misbehavior, as reported independently by Marko Tiikkaja and Marti Raudsepp: some SELECT FOR UPDATE/SHARE queries may fail to return the tuples, and assertion-enabled builds crash. Fix by having heap_lock_updated_tuple test the xmin and return success immediately if the tuple was created by an aborted transaction. The condition where tuples become invisible occurs when an updated tuple chain is followed by heap_lock_updated_tuple, which reports the problem as HeapTupleSelfUpdated to its caller heap_lock_tuple, which in turn propagates that code outwards possibly leading the calling code (ExecLockRows) to believe that the tuple exists no longer. Backpatch to 9.3. Only on 9.5 and newer this leads to a visible failure, because of commit 27846f02c176; before that, heap_lock_tuple skips the whole dance when the tuple is already locked by the same transaction, because of the ancient HeapTupleSatisfiesUpdate behavior. Still, the buggy condition may also exist in more convoluted scenarios involving concurrent transactions, so it seems safer to fix the bug in the old branches too. Discussion: https://www.postgresql.org/message-id/CABRT9RC81YUf1=jsmWopcKJEro=VoeG2ou6sPwyOUTx_qteRsg@mail.gmail.com https://www.postgresql.org/message-id/48d3eade-98d3-8b9a-477e-1a8dc32a724d@joh.to
2016-09-09 20:54:29 +02:00
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
old_infomask = mytup.t_data->t_infomask;
old_infomask2 = mytup.t_data->t_infomask2;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
/*
* If this tuple version has been updated or locked by some concurrent
* transaction(s), what we do depends on whether our lock mode
* conflicts with what those other transactions hold, and also on the
* status of them.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
if (!(old_infomask & HEAP_XMAX_INVALID))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
TransactionId rawxmax;
bool needwait;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
if (old_infomask & HEAP_XMAX_IS_MULTI)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
int nmembers;
int i;
MultiXactMember *members;
Fix handling of multixacts predating pg_upgrade After pg_upgrade, it is possible that some tuples' Xmax have multixacts corresponding to the old installation; such multixacts cannot have running members anymore. In many code sites we already know not to read them and clobber them silently, but at least when VACUUM tries to freeze a multixact or determine whether one needs freezing, there's an attempt to resolve it to its member transactions by calling GetMultiXactIdMembers, and if the multixact value is "in the future" with regards to the current valid multixact range, an error like this is raised: ERROR: MultiXactId 123 has not been created yet -- apparent wraparound and vacuuming fails. Per discussion with Andrew Gierth, it is completely bogus to try to resolve multixacts coming from before a pg_upgrade, regardless of where they stand with regards to the current valid multixact range. It's possible to get from under this problem by doing SELECT FOR UPDATE of the problem tuples, but if tables are large, this is slow and tedious, so a more thorough solution is desirable. To fix, we realize that multixacts in xmax created in 9.2 and previous have a specific bit pattern that is never used in 9.3 and later (we already knew this, per comments and infomask tests sprinkled in various places, but we weren't leveraging this knowledge appropriately). Whenever the infomask of the tuple matches that bit pattern, we just ignore the multixact completely as if Xmax wasn't set; or, in the case of tuple freezing, we act as if an unwanted value is set and clobber it without decoding. This guarantees that no errors will be raised, and that the values will be progressively removed until all tables are clean. Most callers of GetMultiXactIdMembers are patched to recognize directly that the value is a removable "empty" multixact and avoid calling GetMultiXactIdMembers altogether. To avoid changing the signature of GetMultiXactIdMembers() in back branches, we keep the "allow_old" boolean flag but rename it to "from_pgupgrade"; if the flag is true, we always return an empty set instead of looking up the multixact. (I suppose we could remove the argument in the master branch, but I chose not to do so in this commit). This was broken all along, but the error-facing message appeared first because of commit 8e9a16ab8f7f and was partially fixed in a25c2b7c4db3. This fix, backpatched all the way back to 9.3, goes approximately in the same direction as a25c2b7c4db3 but should cover all cases. Bug analysis by Andrew Gierth and Álvaro Herrera. A number of public reports match this bug: https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-25 00:29:28 +02:00
/*
* We don't need a test for pg_upgrade'd tuples: this is only
* applied to tuples after the first in an update chain. Said
* first tuple in the chain may well be locked-in-9.2-and-
* pg_upgraded, but that one was already locked by our caller,
* not us; and any subsequent ones cannot be because our
* caller must necessarily have obtained a snapshot later than
* the pg_upgrade itself.
*/
Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
for (i = 0; i < nmembers; i++)
{
result = test_lockmode_for_conflict(members[i].status,
members[i].xid,
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
mode,
&mytup,
&needwait);
/*
* If the tuple was already locked by ourselves in a
* previous iteration of this (say heap_lock_tuple was
* forced to restart the locking loop because of a change
* in xmax), then we hold the lock already on this tuple
* version and we don't need to do anything; and this is
* not an error condition either. We just need to skip
* this tuple and continue locking the next version in the
* update chain.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result == TM_SelfModified)
{
pfree(members);
goto next;
}
if (needwait)
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
XactLockTableWait(members[i].xid, rel,
&mytup.t_self,
XLTW_LockUpdated);
pfree(members);
goto l4;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result != TM_Ok)
{
pfree(members);
goto out_locked;
}
}
if (members)
pfree(members);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
else
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
MultiXactStatus status;
/*
* For a non-multi Xmax, we first need to compute the
* corresponding MultiXactStatus by using the infomask bits.
*/
if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
{
if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
status = MultiXactStatusForKeyShare;
else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
status = MultiXactStatusForShare;
else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
{
if (old_infomask2 & HEAP_KEYS_UPDATED)
status = MultiXactStatusForUpdate;
else
status = MultiXactStatusForNoKeyUpdate;
}
else
{
/*
* LOCK_ONLY present alone (a pg_upgraded tuple marked
* as share-locked in the old cluster) shouldn't be
* seen in the middle of an update chain.
*/
elog(ERROR, "invalid lock status in tuple");
}
}
else
{
/* it's an update, but which kind? */
if (old_infomask2 & HEAP_KEYS_UPDATED)
status = MultiXactStatusUpdate;
else
status = MultiXactStatusNoKeyUpdate;
}
result = test_lockmode_for_conflict(status, rawxmax, mode,
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
&mytup, &needwait);
/*
* If the tuple was already locked by ourselves in a previous
* iteration of this (say heap_lock_tuple was forced to
* restart the locking loop because of a change in xmax), then
* we hold the lock already on this tuple version and we don't
* need to do anything; and this is not an error condition
* either. We just need to skip this tuple and continue
* locking the next version in the update chain.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result == TM_SelfModified)
goto next;
if (needwait)
{
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
XactLockTableWait(rawxmax, rel, &mytup.t_self,
XLTW_LockUpdated);
goto l4;
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
if (result != TM_Ok)
{
goto out_locked;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
}
/* compute the new Xmax and infomask values for the tuple ... */
compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
xid, mode, false,
&new_xmax, &new_infomask, &new_infomask2);
if (PageIsAllVisible(BufferGetPage(buf)) &&
visibilitymap_clear(rel, block, vmbuffer,
VISIBILITYMAP_ALL_FROZEN))
cleared_all_frozen = true;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
START_CRIT_SECTION();
/* ... and set them */
HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
mytup.t_data->t_infomask |= new_infomask;
mytup.t_data->t_infomask2 |= new_infomask2;
MarkBufferDirty(buf);
/* XLOG stuff */
if (RelationNeedsWAL(rel))
{
xl_heap_lock_updated xlrec;
XLogRecPtr recptr;
Page page = BufferGetPage(buf);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
xlrec.xmax = new_xmax;
xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
xlrec.flags =
cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
next:
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/* if we find the end of update chain, we're done. */
if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
2018-04-07 22:24:10 +02:00
HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
HeapTupleHeaderIsOnlyLocked(mytup.t_data))
{
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
goto out_locked;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/* tail recursion */
priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
UnlockReleaseBuffer(buf);
}
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
result = TM_Ok;
out_locked:
UnlockReleaseBuffer(buf);
out_unlocked:
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
return result;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/*
* heap_lock_updated_tuple
* Follow update chain when locking an updated tuple, acquiring locks (row
* marks) on the updated versions.
*
* The initial tuple is assumed to be already locked.
*
* This function doesn't check visibility, it just unconditionally marks the
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* tuple(s) as locked. If any tuple in the updated chain is being deleted
* concurrently (or updated with the key being modified), sleep until the
* transaction doing it is finished.
*
* Note that we don't acquire heavyweight tuple locks on the tuples we walk
* when we have to wait for other transactions to release them, as opposed to
* what heap_lock_tuple does. The reason is that having more than one
* transaction walking the chain is probably uncommon enough that risk of
* starvation is not likely: one of the preconditions for being here is that
* the snapshot in use predates the update that created this tuple (because we
* started at an earlier version of the tuple), but at the same time such a
* transaction cannot be using repeatable read or serializable isolation
* levels, because that would lead to a serializability failure.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
static TM_Result
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
TransactionId xid, LockTupleMode mode)
{
2018-04-07 22:24:10 +02:00
/*
* If the tuple has not been updated, or has moved into another partition
* (effectively a delete) stop here.
*/
if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
!ItemPointerEquals(&tuple->t_self, ctid))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
/*
* If this is the first possibly-multixact-able operation in the
* current transaction, set my per-backend OldestMemberMXactId
* setting. We can be certain that the transaction will never become a
* member of any older MultiXactIds than that. (We have to do this
* even if we end up just using our own TransactionId below, since
* some other backend could incorporate our XID into a MultiXact
* immediately afterwards.)
*/
MultiXactIdSetOldestMember();
return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
}
/* nothing to lock */
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
return TM_Ok;
}
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
/*
* heap_finish_speculative - mark speculative insertion as successful
*
* To successfully finish a speculative insertion we have to clear speculative
* token from tuple. To do so the t_ctid field, which will contain a
* speculative token value, is modified in place to point to the tuple itself,
* which is characteristic of a newly inserted ordinary tuple.
*
* NB: It is not ok to commit without either finishing or aborting a
* speculative insertion. We could treat speculative tuples of committed
* transactions implicitly as completed, but then we would have to be prepared
* to deal with speculative tokens on committed tuples. That wouldn't be
* difficult - no-one looks at the ctid field of a tuple with invalid xmax -
* but clearing the token at completion isn't very expensive either.
* An explicit confirmation WAL record also makes logical decoding simpler.
*/
void
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
heap_finish_speculative(Relation relation, ItemPointer tid)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
{
Buffer buffer;
Page page;
OffsetNumber offnum;
ItemId lp = NULL;
HeapTupleHeader htup;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
page = (Page) BufferGetPage(buffer);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
offnum = ItemPointerGetOffsetNumber(tid);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (PageGetMaxOffsetNumber(page) >= offnum)
lp = PageGetItemId(page, offnum);
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(ERROR, "invalid lp");
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
htup = (HeapTupleHeader) PageGetItem(page, lp);
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
Assert(HeapTupleHeaderIsSpeculative(htup));
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
MarkBufferDirty(buffer);
/*
* Replace the speculative insertion token with a real t_ctid, pointing to
* itself like it does on regular tuples.
*/
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
htup->t_ctid = *tid;
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
xl_heap_confirm xlrec;
XLogRecPtr recptr;
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
xlrec.offnum = ItemPointerGetOffsetNumber(tid);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
XLogBeginInsert();
/* We want the same filtering on this as on a plain insert */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
UnlockReleaseBuffer(buffer);
}
/*
* heap_abort_speculative - kill a speculatively inserted tuple
*
* Marks a tuple that was speculatively inserted in the same command as dead,
* by setting its xmin as invalid. That makes it immediately appear as dead
* to all transactions, including our own. In particular, it makes
* HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
* inserting a duplicate key value won't unnecessarily wait for our whole
* transaction to finish (it'll just wait for our speculative insertion to
* finish).
*
* Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
* that arise due to a mutual dependency that is not user visible. By
* definition, unprincipled deadlocks cannot be prevented by the user
* reordering lock acquisition in client code, because the implementation level
* lock acquisitions are not under the user's direct control. If speculative
* inserters did not take this precaution, then under high concurrency they
* could deadlock with each other, which would not be acceptable.
*
* This is somewhat redundant with heap_delete, but we prefer to have a
* dedicated routine with stripped down requirements. Note that this is also
* used to delete the TOAST tuples created during speculative insertion.
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
*
* This routine does not affect logical decoding as it only looks at
* confirmation records.
*/
void
tableam: Add tuple_{insert, delete, update, lock} and use. This adds new, required, table AM callbacks for insert/delete/update and lock_tuple. To be able to reasonably use those, the EvalPlanQual mechanism had to be adapted, moving more logic into the AM. Previously both delete/update/lock call-sites and the EPQ mechanism had to have awareness of the specific tuple format to be able to fetch the latest version of a tuple. Obviously that needs to be abstracted away. To do so, move the logic that find the latest row version into the AM. lock_tuple has a new flag argument, TUPLE_LOCK_FLAG_FIND_LAST_VERSION, that forces it to lock the last version, rather than the current one. It'd have been possible to do so via a separate callback as well, but finding the last version usually also necessitates locking the newest version, making it sensible to combine the two. This replaces the previous use of EvalPlanQualFetch(). Additionally HeapTupleUpdated, which previously signaled either a concurrent update or delete, is now split into two, to avoid callers needing AM specific knowledge to differentiate. The move of finding the latest row version into tuple_lock means that encountering a row concurrently moved into another partition will now raise an error about "tuple to be locked" rather than "tuple to be updated/deleted" - which is accurate, as that always happens when locking rows. While possible slightly less helpful for users, it seems like an acceptable trade-off. As part of this commit HTSU_Result has been renamed to TM_Result, and its members been expanded to differentiated between updating and deleting. HeapUpdateFailureData has been renamed to TM_FailureData. The interface to speculative insertion is changed so nodeModifyTable.c does not have to set the speculative token itself anymore. Instead there's a version of tuple_insert, tuple_insert_speculative, that performs the speculative insertion (without requiring a flag to signal that fact), and the speculative insertion is either made permanent with table_complete_speculative(succeeded = true) or aborted with succeeded = false). Note that multi_insert is not yet routed through tableam, nor is COPY. Changing multi_insert requires changes to copy.c that are large enough to better be done separately. Similarly, although simpler, CREATE TABLE AS and CREATE MATERIALIZED VIEW are also only going to be adjusted in a later commit. Author: Andres Freund and Haribabu Kommi Discussion: https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de https://postgr.es/m/20190313003903.nwvrxi7rw3ywhdel@alap3.anarazel.de https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-24 03:55:57 +01:00
heap_abort_speculative(Relation relation, ItemPointer tid)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
{
TransactionId xid = GetCurrentTransactionId();
ItemId lp;
HeapTupleData tp;
Page page;
BlockNumber block;
Buffer buffer;
TransactionId prune_xid;
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
Assert(ItemPointerIsValid(tid));
block = ItemPointerGetBlockNumber(tid);
buffer = ReadBuffer(relation, block);
page = BufferGetPage(buffer);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
/*
* Page can't be all visible, we just inserted into it, and are still
* running.
*/
Assert(!PageIsAllVisible(page));
lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
Assert(ItemIdIsNormal(lp));
tp.t_tableOid = RelationGetRelid(relation);
tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
tp.t_len = ItemIdGetLength(lp);
tp.t_self = *tid;
/*
* Sanity check that the tuple really is a speculatively inserted tuple,
* inserted by us.
*/
if (tp.t_data->t_choice.t_heap.t_xmin != xid)
elog(ERROR, "attempted to kill a tuple inserted by another transaction");
if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
elog(ERROR, "attempted to kill a non-speculative tuple");
Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
/*
* No need to check for serializable conflicts here. There is never a
* need for a combo CID, either. No need to extract replica identity, or
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
* do anything special with infomask bits.
*/
START_CRIT_SECTION();
/*
* The tuple will become DEAD immediately. Flag that this page is a
* candidate for pruning by setting xmin to TransactionXmin. While not
* immediately prunable, it is the oldest xid we can cheaply determine
* that's safe against wraparound / being older than the table's
* relfrozenxid. To defend against the unlikely case of a new relation
* having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
* if so (vacuum can't subsequently move relfrozenxid to beyond
* TransactionXmin, so there's no race here).
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
*/
Assert(TransactionIdIsValid(TransactionXmin));
if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
prune_xid = relation->rd_rel->relfrozenxid;
else
prune_xid = TransactionXmin;
PageSetPrunable(page, prune_xid);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
/* store transaction information of xact deleting the tuple */
tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
/*
* Set the tuple header xmin to InvalidTransactionId. This makes the
* tuple immediately invisible everyone. (In particular, to any
* transactions waiting on the speculative token, woken up later.)
*/
HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
/* Clear the speculative insertion token too */
tp.t_data->t_ctid = tp.t_self;
MarkBufferDirty(buffer);
/*
* XLOG stuff
*
* The WAL records generated here match heap_delete(). The same recovery
* routines are used.
*/
if (RelationNeedsWAL(relation))
{
xl_heap_delete xlrec;
XLogRecPtr recptr;
xlrec.flags = XLH_DELETE_IS_SUPER;
xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
tp.t_data->t_infomask2);
xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
xlrec.xmax = xid;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
/* No replica identity & replication origin logged */
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
if (HeapTupleHasExternal(&tp))
{
Assert(!IsToastRelation(relation));
heap_toast_delete(relation, &tp, true);
}
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
/*
* Never need to mark tuple for invalidation, since catalogs don't support
* speculative insertion
*/
/* Now we can release the buffer */
ReleaseBuffer(buffer);
/* count deletion, as we counted the insertion too */
pgstat_count_heap_delete(relation);
}
/*
* heap_inplace_update - update a tuple "in place" (ie, overwrite it)
*
* Overwriting violates both MVCC and transactional safety, so the uses
* of this function in Postgres are extremely limited. Nonetheless we
* find some places to use it.
*
* The tuple cannot change size, and therefore it's reasonable to assume
* that its null bitmap (if any) doesn't change either. So we just
* overwrite the data portion of the tuple without touching the null
* bitmap or any of the header fields.
*
* tuple is an in-memory tuple structure containing the data to be written
* over the target tuple. Also, tuple->t_self identifies the target tuple.
*
* Note that the tuple updated here had better not come directly from the
* syscache if the relation has a toast relation as this tuple could
* include toast values that have been expanded, causing a failure here.
*/
void
heap_inplace_update(Relation relation, HeapTuple tuple)
{
Buffer buffer;
Page page;
OffsetNumber offnum;
ItemId lp = NULL;
HeapTupleHeader htup;
uint32 oldlen;
uint32 newlen;
/*
* For now, we don't allow parallel updates. Unlike a regular update,
* this should never create a combo CID, so it might be possible to relax
* this restriction, but not without more thought and testing. It's not
* clear that it would be useful, anyway.
*/
if (IsInParallelMode())
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
errmsg("cannot update tuples during a parallel operation")));
buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
page = (Page) BufferGetPage(buffer);
offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
if (PageGetMaxOffsetNumber(page) >= offnum)
lp = PageGetItemId(page, offnum);
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(ERROR, "invalid lp");
htup = (HeapTupleHeader) PageGetItem(page, lp);
oldlen = ItemIdGetLength(lp) - htup->t_hoff;
newlen = tuple->t_len - tuple->t_data->t_hoff;
if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
elog(ERROR, "wrong tuple length");
/* NO EREPORT(ERROR) from here till changes are logged */
START_CRIT_SECTION();
memcpy((char *) htup + htup->t_hoff,
(char *) tuple->t_data + tuple->t_data->t_hoff,
newlen);
MarkBufferDirty(buffer);
/* XLOG stuff */
if (RelationNeedsWAL(relation))
{
xl_heap_inplace xlrec;
XLogRecPtr recptr;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/* inplace updates aren't decoded atm, don't log the origin */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
UnlockReleaseBuffer(buffer);
/*
* Send out shared cache inval if necessary. Note that because we only
* pass the new version of the tuple, this mustn't be used for any
* operations that could change catcache lookup keys. But we aren't
* bothering with index updates either, so that's true a fortiori.
*/
if (!IsBootstrapProcessingMode())
CacheInvalidateHeapTuple(relation, tuple, NULL);
}
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
#define FRM_NOOP 0x0001
#define FRM_INVALIDATE_XMAX 0x0002
#define FRM_RETURN_IS_XID 0x0004
#define FRM_RETURN_IS_MULTI 0x0008
#define FRM_MARK_COMMITTED 0x0010
/*
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
* FreezeMultiXactId
* Determine what to do during freezing when a tuple is marked by a
* MultiXactId.
*
* "flags" is an output value; it's used to tell caller what to do on return.
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* "pagefrz" is an input/output value, used to manage page level freezing.
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
*
* Possible values that we can set in "flags":
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
* FRM_NOOP
* don't do anything -- keep existing Xmax
* FRM_INVALIDATE_XMAX
* mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
* FRM_RETURN_IS_XID
* The Xid return value is a single update Xid to set as xmax.
* FRM_MARK_COMMITTED
* Xmax can be marked as HEAP_XMAX_COMMITTED
* FRM_RETURN_IS_MULTI
* The return value is a new MultiXactId to set as new Xmax.
* (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* Caller delegates control of page freezing to us. In practice we always
* force freezing of caller's page unless FRM_NOOP processing is indicated.
* We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
* can never be left behind. We freely choose when and how to process each
* Multi, without ever violating the cutoff postconditions for freezing.
*
* It's useful to remove Multis on a proactive timeline (relative to freezing
* XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can also
* be cheaper in the short run, for us, since we too can avoid SLRU buffer
* misses through eager processing.
*
* NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
* when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
* This can usually be put off, which is usually enough to avoid it altogether.
* Allocating new multis during VACUUM should be avoided on general principle;
* only VACUUM can advance relminmxid, so allocating new Multis here comes with
* its own special risks.
*
* NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
* using heap_tuple_should_freeze when we haven't forced page-level freezing.
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
* have already forced page-level freezing, since that might incur the same
* SLRU buffer misses that we specifically intended to avoid by freezing.
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*/
static TransactionId
FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
const struct VacuumCutoffs *cutoffs, uint16 *flags,
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
HeapPageFreeze *pagefrz)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
TransactionId newxmax;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
MultiXactMember *members;
int nmembers;
bool need_replace;
int nnewmembers;
MultiXactMember *newmembers;
bool has_lockers;
TransactionId update_xid;
bool update_committed;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
TransactionId FreezePageRelfrozenXid;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*flags = 0;
/* We should only be called in Multis */
Assert(t_infomask & HEAP_XMAX_IS_MULTI);
Fix handling of multixacts predating pg_upgrade After pg_upgrade, it is possible that some tuples' Xmax have multixacts corresponding to the old installation; such multixacts cannot have running members anymore. In many code sites we already know not to read them and clobber them silently, but at least when VACUUM tries to freeze a multixact or determine whether one needs freezing, there's an attempt to resolve it to its member transactions by calling GetMultiXactIdMembers, and if the multixact value is "in the future" with regards to the current valid multixact range, an error like this is raised: ERROR: MultiXactId 123 has not been created yet -- apparent wraparound and vacuuming fails. Per discussion with Andrew Gierth, it is completely bogus to try to resolve multixacts coming from before a pg_upgrade, regardless of where they stand with regards to the current valid multixact range. It's possible to get from under this problem by doing SELECT FOR UPDATE of the problem tuples, but if tables are large, this is slow and tedious, so a more thorough solution is desirable. To fix, we realize that multixacts in xmax created in 9.2 and previous have a specific bit pattern that is never used in 9.3 and later (we already knew this, per comments and infomask tests sprinkled in various places, but we weren't leveraging this knowledge appropriately). Whenever the infomask of the tuple matches that bit pattern, we just ignore the multixact completely as if Xmax wasn't set; or, in the case of tuple freezing, we act as if an unwanted value is set and clobber it without decoding. This guarantees that no errors will be raised, and that the values will be progressively removed until all tables are clean. Most callers of GetMultiXactIdMembers are patched to recognize directly that the value is a removable "empty" multixact and avoid calling GetMultiXactIdMembers altogether. To avoid changing the signature of GetMultiXactIdMembers() in back branches, we keep the "allow_old" boolean flag but rename it to "from_pgupgrade"; if the flag is true, we always return an empty set instead of looking up the multixact. (I suppose we could remove the argument in the master branch, but I chose not to do so in this commit). This was broken all along, but the error-facing message appeared first because of commit 8e9a16ab8f7f and was partially fixed in a25c2b7c4db3. This fix, backpatched all the way back to 9.3, goes approximately in the same direction as a25c2b7c4db3 but should cover all cases. Bug analysis by Andrew Gierth and Álvaro Herrera. A number of public reports match this bug: https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-25 00:29:28 +02:00
if (!MultiXactIdIsValid(multi) ||
HEAP_LOCKED_UPGRADED(t_infomask))
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
*flags |= FRM_INVALIDATE_XMAX;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
pagefrz->freeze_required = true;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
return InvalidTransactionId;
}
else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("found multixact %u from before relminmxid %u",
multi, cutoffs->relminmxid)));
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
TransactionId update_xact;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* This old multi cannot possibly have members still running, but
* verify just in case. If it was a locker only, it can be removed
* without any further consideration; but if it contained an update,
* we might need to preserve it.
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*/
if (MultiXactIdIsRunning(multi,
HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
multi, cutoffs->OldestMxact)));
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
{
*flags |= FRM_INVALIDATE_XMAX;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
pagefrz->freeze_required = true;
return InvalidTransactionId;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* replace multi with single XID for its updater? */
update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
multi, update_xact,
cutoffs->relfrozenxid)));
else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
{
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* Updater XID has to have aborted (otherwise the tuple would have
* been pruned away instead, since updater XID is < OldestXmin).
* Just remove xmax.
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*/
if (TransactionIdDidCommit(update_xact))
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
multi, update_xact,
cutoffs->OldestXmin)));
*flags |= FRM_INVALIDATE_XMAX;
pagefrz->freeze_required = true;
return InvalidTransactionId;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* Have to keep updater XID as new xmax */
*flags |= FRM_RETURN_IS_XID;
pagefrz->freeze_required = true;
return update_xact;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
/*
* Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
* need to walk the whole members array to figure out what to do, if
* anything.
*/
nmembers =
Fix handling of multixacts predating pg_upgrade After pg_upgrade, it is possible that some tuples' Xmax have multixacts corresponding to the old installation; such multixacts cannot have running members anymore. In many code sites we already know not to read them and clobber them silently, but at least when VACUUM tries to freeze a multixact or determine whether one needs freezing, there's an attempt to resolve it to its member transactions by calling GetMultiXactIdMembers, and if the multixact value is "in the future" with regards to the current valid multixact range, an error like this is raised: ERROR: MultiXactId 123 has not been created yet -- apparent wraparound and vacuuming fails. Per discussion with Andrew Gierth, it is completely bogus to try to resolve multixacts coming from before a pg_upgrade, regardless of where they stand with regards to the current valid multixact range. It's possible to get from under this problem by doing SELECT FOR UPDATE of the problem tuples, but if tables are large, this is slow and tedious, so a more thorough solution is desirable. To fix, we realize that multixacts in xmax created in 9.2 and previous have a specific bit pattern that is never used in 9.3 and later (we already knew this, per comments and infomask tests sprinkled in various places, but we weren't leveraging this knowledge appropriately). Whenever the infomask of the tuple matches that bit pattern, we just ignore the multixact completely as if Xmax wasn't set; or, in the case of tuple freezing, we act as if an unwanted value is set and clobber it without decoding. This guarantees that no errors will be raised, and that the values will be progressively removed until all tables are clean. Most callers of GetMultiXactIdMembers are patched to recognize directly that the value is a removable "empty" multixact and avoid calling GetMultiXactIdMembers altogether. To avoid changing the signature of GetMultiXactIdMembers() in back branches, we keep the "allow_old" boolean flag but rename it to "from_pgupgrade"; if the flag is true, we always return an empty set instead of looking up the multixact. (I suppose we could remove the argument in the master branch, but I chose not to do so in this commit). This was broken all along, but the error-facing message appeared first because of commit 8e9a16ab8f7f and was partially fixed in a25c2b7c4db3. This fix, backpatched all the way back to 9.3, goes approximately in the same direction as a25c2b7c4db3 but should cover all cases. Bug analysis by Andrew Gierth and Álvaro Herrera. A number of public reports match this bug: https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-25 00:29:28 +02:00
GetMultiXactIdMembers(multi, &members, false,
HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
if (nmembers <= 0)
{
/* Nothing worth keeping */
*flags |= FRM_INVALIDATE_XMAX;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
pagefrz->freeze_required = true;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
return InvalidTransactionId;
}
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/*
* The FRM_NOOP case is the only case where we might need to ratchet back
* FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
* case where our caller might ratchet back its NoFreezePageRelfrozenXid
* or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
* FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
* trackers managed by VACUUM being ratcheting back by xmax to the degree
* required to make it safe to leave xmax undisturbed, independent of
* whether or not page freezing is triggered somewhere else.
*
* Our policy is to force freezing in every case other than FRM_NOOP,
* which obviates the need to maintain either set of trackers, anywhere.
* Every other case will reliably execute a freeze plan for xmax that
* either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
* sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
* (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
* OldestXmin/OldestMxact, so later values never need to be tracked here.)
*/
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
need_replace = false;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
for (int i = 0; i < nmembers; i++)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
TransactionId xid = members[i].xid;
Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* Can't violate the FreezeLimit postcondition */
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
need_replace = true;
break;
}
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
FreezePageRelfrozenXid = xid;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* Can't violate the MultiXactCutoff postcondition, either */
if (!need_replace)
need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
if (!need_replace)
{
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
* both together to make it safe to retain this particular multi after
* freezing its page
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
*/
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*flags |= FRM_NOOP;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
pagefrz->FreezePageRelminMxid = multi;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
pfree(members);
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
return multi;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
/*
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
* Do a more thorough second pass over the multi to figure out which
* member XIDs actually need to be kept. Checking the precise status of
* individual members might even show that we don't need to keep anything.
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* That is quite possible even though the Multi must be >= OldestMxact,
* since our second pass only keeps member XIDs when it's truly necessary;
* even member XIDs >= OldestXmin often won't be kept by second pass.
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*/
nnewmembers = 0;
newmembers = palloc(sizeof(MultiXactMember) * nmembers);
has_lockers = false;
update_xid = InvalidTransactionId;
update_committed = false;
/*
* Determine whether to keep each member xid, or to ignore it instead
*/
for (int i = 0; i < nmembers; i++)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
TransactionId xid = members[i].xid;
MultiXactStatus mstatus = members[i].status;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
if (!ISUPDATE_from_mxstatus(mstatus))
{
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* Locker XID (not updater XID). We only keep lockers that are
* still running.
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*/
if (TransactionIdIsCurrentTransactionId(xid) ||
TransactionIdIsInProgress(xid))
{
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
multi, xid,
cutoffs->OldestXmin)));
newmembers[nnewmembers++] = members[i];
has_lockers = true;
}
continue;
}
/*
* Updater XID (not locker XID). Should we keep it?
*
* Since the tuple wasn't totally removed when vacuum pruned, the
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* update Xid cannot possibly be older than OldestXmin cutoff unless
* the updater XID aborted. If the updater transaction is known
* aborted or crashed then it's okay to ignore it, otherwise not.
*
* In any case the Multi should never contain two updaters, whatever
* their individual commit status. Check for that first, in passing.
*/
if (TransactionIdIsValid(update_xid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("multixact %u has two or more updating members",
multi),
errdetail_internal("First updater XID=%u second updater XID=%u.",
update_xid, xid)));
/*
* As with all tuple visibility routines, it's critical to test
* TransactionIdIsInProgress before TransactionIdDidCommit, because of
* race conditions explained in detail in heapam_visibility.c.
*/
if (TransactionIdIsCurrentTransactionId(xid) ||
TransactionIdIsInProgress(xid))
update_xid = xid;
else if (TransactionIdDidCommit(xid))
{
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* The transaction committed, so we can tell caller to set
* HEAP_XMAX_COMMITTED. (We can only do this because we know the
* transaction is not running.)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*/
update_committed = true;
update_xid = xid;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
else
{
/*
* Not in progress, not committed -- must be aborted or crashed;
* we can ignore it.
*/
continue;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
/*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* We determined that updater must be kept -- add it to pending new
* members list
*/
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
multi, xid, cutoffs->OldestXmin)));
newmembers[nnewmembers++] = members[i];
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
pfree(members);
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/*
* Determine what to do with caller's multi based on information gathered
* during our second pass
*/
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
if (nnewmembers == 0)
{
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* Nothing worth keeping */
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*flags |= FRM_INVALIDATE_XMAX;
newxmax = InvalidTransactionId;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
else if (TransactionIdIsValid(update_xid) && !has_lockers)
{
/*
* If there's a single member and it's an update, pass it back alone
* without creating a new Multi. (XXX we could do this when there's a
* single remaining locker, too, but that would complicate the API too
* much; moreover, the case with the single updater is more
* interesting, because those are longer-lived.)
*/
Assert(nnewmembers == 1);
*flags |= FRM_RETURN_IS_XID;
if (update_committed)
*flags |= FRM_MARK_COMMITTED;
newxmax = update_xid;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
else
{
/*
* Create a new multixact with the surviving members of the previous
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* one, to set as new Xmax in the tuple
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*/
newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*flags |= FRM_RETURN_IS_MULTI;
}
pfree(newmembers);
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
pagefrz->freeze_required = true;
return newxmax;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
/*
* heap_prepare_freeze_tuple
*
* Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
* setup enough state (in the *frz output argument) to enable caller to
* process this tuple as part of freezing its page, and return true. Return
* false if nothing can be changed about the tuple right now.
*
* Also sets *totally_frozen to true if the tuple will be totally frozen once
* caller executes returned freeze plan (or if the tuple was already totally
* frozen by an earlier VACUUM). This indicates that there are no remaining
* XIDs or MultiXactIds that will need to be processed by a future VACUUM.
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
* tuple that we returned true for, and call heap_freeze_execute_prepared to
* execute freezing. Caller must initialize pagefrz fields for page as a
* whole before first call here for each heap page.
*
* VACUUM caller decides on whether or not to freeze the page as a whole.
* We'll often prepare freeze plans for a page that caller just discards.
* However, VACUUM doesn't always get to make a choice; it must freeze when
* pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
* MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
* that VACUUM always follows that rule.
*
* We sometimes force freezing of xmax MultiXactId values long before it is
* strictly necessary to do so just to ensure the FreezeLimit postcondition.
* It's worth processing MultiXactIds proactively when it is cheap to do so,
* and it's convenient to make that happen by piggy-backing it on the "force
* freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
* because it is expensive right now (though only when it's still possible to
* do so without violating the FreezeLimit/MultiXactCutoff postcondition).
*
* It is assumed that the caller has checked the tuple with
* HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
* (else we should be removing the tuple, not freezing it).
*
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
* NB: This function has side effects: it might allocate a new MultiXactId.
* It will be set as tuple's new xmax when our *frz output is processed within
* heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
* then caller had better have an exclusive lock on it already.
*/
bool
heap_prepare_freeze_tuple(HeapTupleHeader tuple,
const struct VacuumCutoffs *cutoffs,
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
HeapPageFreeze *pagefrz,
HeapTupleFreeze *frz, bool *totally_frozen)
{
bool xmin_already_frozen = false,
xmax_already_frozen = false;
bool freeze_xmin = false,
replace_xvac = false,
replace_xmax = false,
freeze_xmax = false;
TransactionId xid;
frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
frz->t_infomask2 = tuple->t_infomask2;
frz->t_infomask = tuple->t_infomask;
frz->frzflags = 0;
frz->checkflags = 0;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* Process xmin, while keeping track of whether it's already frozen, or
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* will become frozen iff our freeze plan is executed by caller (could be
* neither).
*/
xid = HeapTupleHeaderGetXmin(tuple);
if (!TransactionIdIsNormal(xid))
xmin_already_frozen = true;
else
{
if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("found xmin %u from before relfrozenxid %u",
xid, cutoffs->relfrozenxid)));
/* Will set freeze_xmin flags in freeze plan below */
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
/* Verify that xmin committed if and when freeze plan is executed */
if (freeze_xmin)
frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED;
}
/*
* Old-style VACUUM FULL is gone, but we have to process xvac for as long
* as we support having MOVED_OFF/MOVED_IN tuples in the database
*/
xid = HeapTupleHeaderGetXvac(tuple);
if (TransactionIdIsNormal(xid))
{
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
/*
* For Xvac, we always freeze proactively. This allows totally_frozen
* tracking to ignore xvac.
*/
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
replace_xvac = pagefrz->freeze_required = true;
/* Will set replace_xvac flags in freeze plan below */
}
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* Now process xmax */
xid = frz->xmax;
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
{
/* Raw xmax is a MultiXactId */
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
TransactionId newxmax;
uint16 flags;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/*
* We will either remove xmax completely (in the "freeze_xmax" path),
* process xmax by replacing it (in the "replace_xmax" path), or
* perform no-op xmax processing. The only constraint is that the
* FreezeLimit/MultiXactCutoff postcondition must never be violated.
*/
newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
&flags, pagefrz);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (flags & FRM_NOOP)
{
/*
* xmax is a MultiXactId, and nothing about it changes for now.
* This is the only case where 'freeze_required' won't have been
* set for us by FreezeMultiXactId, as well as the only case where
* neither freeze_xmax nor replace_xmax are set (given a multi).
*
* This is a no-op, but the call to FreezeMultiXactId might have
* ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
* for us (the "freeze page" variants, specifically). That'll
* make it safe for our caller to freeze the page later on, while
* leaving this particular xmax undisturbed.
*
* FreezeMultiXactId is _not_ responsible for the "no freeze"
* NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
* job. A call to heap_tuple_should_freeze for this same tuple
* will take place below if 'freeze_required' isn't set already.
* (This repeats work from FreezeMultiXactId, but allows "no
* freeze" tracker maintenance to happen in only one place.)
*/
Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff));
Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
}
else if (flags & FRM_RETURN_IS_XID)
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
{
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/*
* xmax will become an updater Xid (original MultiXact's updater
* member Xid will be carried forward as a simple Xid in Xmax).
*/
Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin));
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* NB -- some of these transformations are only valid because we
* know the return Xid is a tuple updater (i.e. not merely a
* locker.) Also note that the only reason we don't explicitly
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
* worry about HEAP_KEYS_UPDATED is because it lives in
* t_infomask2 rather than t_infomask.
*/
frz->t_infomask &= ~HEAP_XMAX_BITS;
frz->xmax = newxmax;
if (flags & FRM_MARK_COMMITTED)
frz->t_infomask |= HEAP_XMAX_COMMITTED;
replace_xmax = true;
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
}
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
else if (flags & FRM_RETURN_IS_MULTI)
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
{
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
uint16 newbits;
uint16 newbits2;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/*
* xmax is an old MultiXactId that we have to replace with a new
* MultiXactId, to carry forward two or more original member XIDs.
*/
Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact));
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
/*
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
* We can't use GetMultiXactIdHintBits directly on the new multi
* here; that routine initializes the masks to all zeroes, which
* would lose other bits we need. Doing it this way ensures all
* unrelated bits remain untouched.
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
*/
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
frz->t_infomask &= ~HEAP_XMAX_BITS;
frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
frz->t_infomask |= newbits;
frz->t_infomask2 |= newbits2;
frz->xmax = newxmax;
replace_xmax = true;
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
}
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
else
{
/*
* Freeze plan for tuple "freezes xmax" in the strictest sense:
* it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
*/
Assert(flags & FRM_INVALIDATE_XMAX);
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
Assert(!TransactionIdIsValid(newxmax));
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* Will set freeze_xmax flags in freeze plan below */
freeze_xmax = true;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
}
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* MultiXactId processing forces freezing (barring FRM_NOOP case) */
Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
}
else if (TransactionIdIsNormal(xid))
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
{
/* Raw xmax is normal XID */
if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("found xmax %u from before relfrozenxid %u",
xid, cutoffs->relfrozenxid)));
/* Will set freeze_xmax flags in freeze plan below */
freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/*
* Verify that xmax aborted if and when freeze plan is executed,
* provided it's from an update. (A lock-only xmax can be removed
* independent of this, since the lock is released at xact end.)
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
*/
if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
}
else if (!TransactionIdIsValid(xid))
{
/* Raw xmax is InvalidTransactionId XID */
Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
xmax_already_frozen = true;
}
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
xid, tuple->t_infomask)));
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
if (freeze_xmin)
{
Assert(!xmin_already_frozen);
frz->t_infomask |= HEAP_XMIN_FROZEN;
}
if (replace_xvac)
{
/*
* If a MOVED_OFF tuple is not dead, the xvac transaction must have
* failed; whereas a non-dead MOVED_IN tuple must mean the xvac
* transaction succeeded.
*/
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
Assert(pagefrz->freeze_required);
if (tuple->t_infomask & HEAP_MOVED_OFF)
frz->frzflags |= XLH_INVALID_XVAC;
else
frz->frzflags |= XLH_FREEZE_XVAC;
}
if (replace_xmax)
{
Assert(!xmax_already_frozen && !freeze_xmax);
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
Assert(pagefrz->freeze_required);
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
/* Already set replace_xmax flags in freeze plan earlier */
}
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
if (freeze_xmax)
{
Assert(!xmax_already_frozen && !replace_xmax);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
frz->xmax = InvalidTransactionId;
2007-11-15 22:14:46 +01:00
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
* LOCKED. Normalize to INVALID just to be sure no one gets confused.
* Also get rid of the HEAP_KEYS_UPDATED bit.
*/
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
frz->t_infomask &= ~HEAP_XMAX_BITS;
frz->t_infomask |= HEAP_XMAX_INVALID;
frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
}
/*
* Determine if this tuple is already totally frozen, or will become
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* totally frozen (provided caller executes freeze plans for the page)
*/
*totally_frozen = ((freeze_xmin || xmin_already_frozen) &&
(freeze_xmax || xmax_already_frozen));
2007-11-15 22:14:46 +01:00
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (!pagefrz->freeze_required && !(xmin_already_frozen &&
xmax_already_frozen))
{
/*
* So far no previous tuple from the page made freezing mandatory.
* Does this tuple force caller to freeze the entire page?
*/
pagefrz->freeze_required =
heap_tuple_should_freeze(tuple, cutoffs,
&pagefrz->NoFreezePageRelfrozenXid,
&pagefrz->NoFreezePageRelminMxid);
}
/* Tell caller if this tuple has a usable freeze plan set in *frz */
return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax;
}
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* heap_execute_freeze_tuple
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
* Execute the prepared freezing of a tuple with caller's freeze plan.
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
*
* Caller is responsible for ensuring that no other backend can access the
* storage underlying this tuple, either by holding an exclusive lock on the
2015-12-18 18:03:15 +01:00
* buffer containing it (which is what lazy VACUUM does), or by having it be
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
* in private storage (which is what CLUSTER and friends do).
*/
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
static inline void
heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
HeapTupleHeaderSetXmax(tuple, frz->xmax);
if (frz->frzflags & XLH_FREEZE_XVAC)
HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
if (frz->frzflags & XLH_INVALID_XVAC)
HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
tuple->t_infomask = frz->t_infomask;
tuple->t_infomask2 = frz->t_infomask2;
}
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
/*
* heap_freeze_execute_prepared
*
* Executes freezing of one or more heap tuples on a page on behalf of caller.
* Caller passes an array of tuple plans from heap_prepare_freeze_tuple.
* Caller must set 'offset' in each plan for us. Note that we destructively
* sort caller's tuples array in-place, so caller had better be done with it.
*
* WAL-logs the changes so that VACUUM can advance the rel's relfrozenxid
* later on without any risk of unsafe pg_xact lookups, even following a hard
* crash (or when querying from a standby). We represent freezing by setting
* infomask bits in tuple headers, but this shouldn't be thought of as a hint.
* See section on buffer access rules in src/backend/storage/buffer/README.
*/
void
heap_freeze_execute_prepared(Relation rel, Buffer buffer,
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
TransactionId snapshotConflictHorizon,
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
HeapTupleFreeze *tuples, int ntuples)
{
Page page = BufferGetPage(buffer);
Assert(ntuples > 0);
/*
* Perform xmin/xmax XID status sanity checks before critical section.
*
* heap_prepare_freeze_tuple doesn't perform these checks directly because
* pg_xact lookups are relatively expensive. They shouldn't be repeated
* by successive VACUUMs that each decide against freezing the same page.
*/
for (int i = 0; i < ntuples; i++)
{
HeapTupleFreeze *frz = tuples + i;
ItemId itemid = PageGetItemId(page, frz->offset);
HeapTupleHeader htup;
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
htup = (HeapTupleHeader) PageGetItem(page, itemid);
/* Deliberately avoid relying on tuple hint bits here */
if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
{
TransactionId xmin = HeapTupleHeaderGetRawXmin(htup);
Assert(!HeapTupleHeaderXminFrozen(htup));
if (unlikely(!TransactionIdDidCommit(xmin)))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("uncommitted xmin %u needs to be frozen",
xmin)));
}
/*
* TransactionIdDidAbort won't work reliably in the presence of XIDs
* left behind by transactions that were in progress during a crash,
* so we can only check that xmax didn't commit
*/
if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
{
TransactionId xmax = HeapTupleHeaderGetRawXmax(htup);
Assert(TransactionIdIsNormal(xmax));
if (unlikely(TransactionIdDidCommit(xmax)))
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg_internal("cannot freeze committed xmax %u",
xmax)));
}
}
START_CRIT_SECTION();
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
for (int i = 0; i < ntuples; i++)
{
HeapTupleFreeze *frz = tuples + i;
ItemId itemid = PageGetItemId(page, frz->offset);
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
HeapTupleHeader htup;
htup = (HeapTupleHeader) PageGetItem(page, itemid);
heap_execute_freeze_tuple(htup, frz);
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
}
MarkBufferDirty(buffer);
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
/* Now WAL-log freezing if necessary */
if (RelationNeedsWAL(rel))
{
xl_heap_freeze_plan plans[MaxHeapTuplesPerPage];
OffsetNumber offsets[MaxHeapTuplesPerPage];
int nplans;
xl_heap_freeze_page xlrec;
XLogRecPtr recptr;
/* Prepare deduplicated representation for use in WAL record */
nplans = heap_log_freeze_plan(tuples, ntuples, plans, offsets);
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
Add info in WAL records in preparation for logical slot conflict handling This commit only implements one prerequisite part for allowing logical decoding. The commit message contains an explanation of the overall design, which later commits will refer back to. Overall design: 1. We want to enable logical decoding on standbys, but replay of WAL from the primary might remove data that is needed by logical decoding, causing error(s) on the standby. To prevent those errors, a new replication conflict scenario needs to be addressed (as much as hot standby does). 2. Our chosen strategy for dealing with this type of replication slot is to invalidate logical slots for which needed data has been removed. 3. To do this we need the latestRemovedXid for each change, just as we do for physical replication conflicts, but we also need to know whether any particular change was to data that logical replication might access. That way, during WAL replay, we know when there is a risk of conflict and, if so, if there is a conflict. 4. We can't rely on the standby's relcache entries for this purpose in any way, because the startup process can't access catalog contents. 5. Therefore every WAL record that potentially removes data from the index or heap must carry a flag indicating whether or not it is one that might be accessed during logical decoding. Why do we need this for logical decoding on standby? First, let's forget about logical decoding on standby and recall that on a primary database, any catalog rows that may be needed by a logical decoding replication slot are not removed. This is done thanks to the catalog_xmin associated with the logical replication slot. But, with logical decoding on standby, in the following cases: - hot_standby_feedback is off - hot_standby_feedback is on but there is no a physical slot between the primary and the standby. Then, hot_standby_feedback will work, but only while the connection is alive (for example a node restart would break it) Then, the primary may delete system catalog rows that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). So, it’s mandatory to identify those rows and invalidate the slots that may need them if any. Identifying those rows is the purpose of this commit. Implementation: When a WAL replay on standby indicates that a catalog table tuple is to be deleted by an xid that is greater than a logical slot's catalog_xmin, then that means the slot's catalog_xmin conflicts with the xid, and we need to handle the conflict. While subsequent commits will do the actual conflict handling, this commit adds a new field isCatalogRel in such WAL records (and a new bit set in the xl_heap_visible flags field), that is true for catalog tables, so as to arrange for conflict handling. The affected WAL records are the ones that already contain the snapshotConflictHorizon field, namely: - gistxlogDelete - gistxlogPageReuse - xl_hash_vacuum_one_page - xl_heap_prune - xl_heap_freeze_page - xl_heap_visible - xl_btree_reuse_page - xl_btree_delete - spgxlogVacuumRedirect Due to this new field being added, xl_hash_vacuum_one_page and gistxlogDelete do now contain the offsets to be deleted as a FLEXIBLE_ARRAY_MEMBER. This is needed to ensure correct alignment. It's not needed on the others struct where isCatalogRel has been added. This commit just introduces the WAL format changes mentioned above. Handling the actual conflicts will follow in future commits. Bumps XLOG_PAGE_MAGIC as the several WAL records are changed. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> (in an older version) Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
2023-04-02 21:32:19 +02:00
xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(rel);
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
xlrec.nplans = nplans;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
/*
* The freeze plan array and offset array are not actually in the
* buffer, but pretend that they are. When XLogInsert stores the
* whole buffer, the arrays need not be stored too.
*/
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
XLogRegisterBufData(0, (char *) plans,
nplans * sizeof(xl_heap_freeze_plan));
XLogRegisterBufData(0, (char *) offsets,
ntuples * sizeof(OffsetNumber));
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
PageSetLSN(page, recptr);
}
END_CRIT_SECTION();
}
/*
* Comparator used to deduplicate XLOG_HEAP2_FREEZE_PAGE freeze plans
*/
static int
heap_log_freeze_cmp(const void *arg1, const void *arg2)
{
HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1;
HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2;
if (frz1->xmax < frz2->xmax)
return -1;
else if (frz1->xmax > frz2->xmax)
return 1;
if (frz1->t_infomask2 < frz2->t_infomask2)
return -1;
else if (frz1->t_infomask2 > frz2->t_infomask2)
return 1;
if (frz1->t_infomask < frz2->t_infomask)
return -1;
else if (frz1->t_infomask > frz2->t_infomask)
return 1;
if (frz1->frzflags < frz2->frzflags)
return -1;
else if (frz1->frzflags > frz2->frzflags)
return 1;
/*
* heap_log_freeze_eq would consider these tuple-wise plans to be equal.
* (So the tuples will share a single canonical freeze plan.)
*
* We tiebreak on page offset number to keep each freeze plan's page
* offset number array individually sorted. (Unnecessary, but be tidy.)
*/
if (frz1->offset < frz2->offset)
return -1;
else if (frz1->offset > frz2->offset)
return 1;
Assert(false);
return 0;
}
/*
* Compare fields that describe actions required to freeze tuple with caller's
* open plan. If everything matches then the frz tuple plan is equivalent to
* caller's plan.
*/
static inline bool
heap_log_freeze_eq(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz)
{
if (plan->xmax == frz->xmax &&
plan->t_infomask2 == frz->t_infomask2 &&
plan->t_infomask == frz->t_infomask &&
plan->frzflags == frz->frzflags)
return true;
/* Caller must call heap_log_freeze_new_plan again for frz */
return false;
}
/*
* Start new plan initialized using tuple-level actions. At least one tuple
* will have steps required to freeze described by caller's plan during REDO.
*/
static inline void
heap_log_freeze_new_plan(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz)
{
plan->xmax = frz->xmax;
plan->t_infomask2 = frz->t_infomask2;
plan->t_infomask = frz->t_infomask;
plan->frzflags = frz->frzflags;
plan->ntuples = 1; /* for now */
}
/*
* Deduplicate tuple-based freeze plans so that each distinct set of
* processing steps is only stored once in XLOG_HEAP2_FREEZE_PAGE records.
* Called during original execution of freezing (for logged relations).
*
* Return value is number of plans set in *plans_out for caller. Also writes
* an array of offset numbers into *offsets_out output argument for caller
* (actually there is one array per freeze plan, but that's not of immediate
* concern to our caller).
*/
static int
heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
xl_heap_freeze_plan *plans_out,
OffsetNumber *offsets_out)
{
int nplans = 0;
/* Sort tuple-based freeze plans in the order required to deduplicate */
qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
for (int i = 0; i < ntuples; i++)
{
HeapTupleFreeze *frz = tuples + i;
if (i == 0)
{
/* New canonical freeze plan starting with first tup */
heap_log_freeze_new_plan(plans_out, frz);
nplans++;
}
else if (heap_log_freeze_eq(plans_out, frz))
{
/* tup matches open canonical plan -- include tup in it */
Assert(offsets_out[i - 1] < frz->offset);
plans_out->ntuples++;
}
else
{
/* Tup doesn't match current plan -- done with it now */
plans_out++;
/* New canonical freeze plan starting with this tup */
heap_log_freeze_new_plan(plans_out, frz);
nplans++;
}
/*
* Save page offset number in dedicated buffer in passing.
*
* REDO routine relies on the record's offset numbers array grouping
* offset numbers by freeze plan. The sort order within each grouping
* is ascending offset number order, just to keep things tidy.
*/
offsets_out[i] = frz->offset;
}
Assert(nplans > 0 && nplans <= ntuples);
return nplans;
}
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* heap_freeze_tuple
* Freeze tuple in place, without WAL logging.
*
* Useful for callers like CLUSTER that perform their own WAL logging.
*/
bool
heap_freeze_tuple(HeapTupleHeader tuple,
TransactionId relfrozenxid, TransactionId relminmxid,
TransactionId FreezeLimit, TransactionId MultiXactCutoff)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
HeapTupleFreeze frz;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
bool do_freeze;
bool totally_frozen;
struct VacuumCutoffs cutoffs;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
HeapPageFreeze pagefrz;
cutoffs.relfrozenxid = relfrozenxid;
cutoffs.relminmxid = relminmxid;
cutoffs.OldestXmin = FreezeLimit;
cutoffs.OldestMxact = MultiXactCutoff;
cutoffs.FreezeLimit = FreezeLimit;
cutoffs.MultiXactCutoff = MultiXactCutoff;
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
pagefrz.freeze_required = true;
pagefrz.FreezePageRelfrozenXid = FreezeLimit;
pagefrz.FreezePageRelminMxid = MultiXactCutoff;
pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
&pagefrz, &frz, &totally_frozen);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* Note that because this is not a WAL-logged operation, we don't need to
* fill in the offset in the freeze record.
*/
if (do_freeze)
heap_execute_freeze_tuple(tuple, &frz);
return do_freeze;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* For a given MultiXactId, return the hint bits that should be set in the
* tuple's infomask.
*
* Normally this should be called for a multixact that was just created, and
* so is on our local cache, so the GetMembers call is fast.
*/
static void
GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
uint16 *new_infomask2)
{
int nmembers;
MultiXactMember *members;
int i;
uint16 bits = HEAP_XMAX_IS_MULTI;
uint16 bits2 = 0;
bool has_update = false;
LockTupleMode strongest = LockTupleKeyShare;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* We only use this in multis we just created, so they cannot be values
* pre-pg_upgrade.
*/
nmembers = GetMultiXactIdMembers(multi, &members, false, false);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
for (i = 0; i < nmembers; i++)
{
LockTupleMode mode;
/*
* Remember the strongest lock mode held by any member of the
* multixact.
*/
mode = TUPLOCK_from_mxstatus(members[i].status);
if (mode > strongest)
strongest = mode;
/* See what other bits we need */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
switch (members[i].status)
{
case MultiXactStatusForKeyShare:
case MultiXactStatusForShare:
case MultiXactStatusForNoKeyUpdate:
break;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
case MultiXactStatusForUpdate:
bits2 |= HEAP_KEYS_UPDATED;
break;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
case MultiXactStatusNoKeyUpdate:
has_update = true;
break;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
case MultiXactStatusUpdate:
bits2 |= HEAP_KEYS_UPDATED;
has_update = true;
break;
}
}
if (strongest == LockTupleExclusive ||
strongest == LockTupleNoKeyExclusive)
bits |= HEAP_XMAX_EXCL_LOCK;
else if (strongest == LockTupleShare)
bits |= HEAP_XMAX_SHR_LOCK;
else if (strongest == LockTupleKeyShare)
bits |= HEAP_XMAX_KEYSHR_LOCK;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (!has_update)
bits |= HEAP_XMAX_LOCK_ONLY;
if (nmembers > 0)
pfree(members);
*new_infomask = bits;
*new_infomask2 = bits2;
}
/*
* MultiXactIdGetUpdateXid
*
* Given a multixact Xmax and corresponding infomask, which does not have the
* HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
* transaction.
*
* Caller is expected to check the status of the updating transaction, if
* necessary.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
static TransactionId
MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
{
TransactionId update_xact = InvalidTransactionId;
MultiXactMember *members;
int nmembers;
Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
Assert(t_infomask & HEAP_XMAX_IS_MULTI);
/*
* Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
* pre-pg_upgrade.
*/
nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (nmembers > 0)
{
int i;
for (i = 0; i < nmembers; i++)
{
/* Ignore lockers */
if (!ISUPDATE_from_mxstatus(members[i].status))
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
continue;
/* there can be at most one updater */
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Assert(update_xact == InvalidTransactionId);
update_xact = members[i].xid;
#ifndef USE_ASSERT_CHECKING
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* in an assert-enabled build, walk the whole array to ensure
* there's no other updater.
*/
break;
#endif
}
pfree(members);
}
return update_xact;
}
/*
* HeapTupleGetUpdateXid
* As above, but use a HeapTupleHeader
*
* See also HeapTupleHeaderGetUpdateXid, which can be used without previously
* checking the hint bits.
*/
TransactionId
HeapTupleGetUpdateXid(HeapTupleHeader tuple)
{
return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
tuple->t_infomask);
}
/*
* Does the given multixact conflict with the current transaction grabbing a
* tuple lock of the given strength?
*
* The passed infomask pairs up with the given multixact in the tuple header.
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
*
* If current_is_member is not NULL, it is set to 'true' if the current
* transaction is a member of the given multixact.
*/
static bool
DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
LockTupleMode lockmode, bool *current_is_member)
{
int nmembers;
MultiXactMember *members;
bool result = false;
LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
Fix handling of multixacts predating pg_upgrade After pg_upgrade, it is possible that some tuples' Xmax have multixacts corresponding to the old installation; such multixacts cannot have running members anymore. In many code sites we already know not to read them and clobber them silently, but at least when VACUUM tries to freeze a multixact or determine whether one needs freezing, there's an attempt to resolve it to its member transactions by calling GetMultiXactIdMembers, and if the multixact value is "in the future" with regards to the current valid multixact range, an error like this is raised: ERROR: MultiXactId 123 has not been created yet -- apparent wraparound and vacuuming fails. Per discussion with Andrew Gierth, it is completely bogus to try to resolve multixacts coming from before a pg_upgrade, regardless of where they stand with regards to the current valid multixact range. It's possible to get from under this problem by doing SELECT FOR UPDATE of the problem tuples, but if tables are large, this is slow and tedious, so a more thorough solution is desirable. To fix, we realize that multixacts in xmax created in 9.2 and previous have a specific bit pattern that is never used in 9.3 and later (we already knew this, per comments and infomask tests sprinkled in various places, but we weren't leveraging this knowledge appropriately). Whenever the infomask of the tuple matches that bit pattern, we just ignore the multixact completely as if Xmax wasn't set; or, in the case of tuple freezing, we act as if an unwanted value is set and clobber it without decoding. This guarantees that no errors will be raised, and that the values will be progressively removed until all tables are clean. Most callers of GetMultiXactIdMembers are patched to recognize directly that the value is a removable "empty" multixact and avoid calling GetMultiXactIdMembers altogether. To avoid changing the signature of GetMultiXactIdMembers() in back branches, we keep the "allow_old" boolean flag but rename it to "from_pgupgrade"; if the flag is true, we always return an empty set instead of looking up the multixact. (I suppose we could remove the argument in the master branch, but I chose not to do so in this commit). This was broken all along, but the error-facing message appeared first because of commit 8e9a16ab8f7f and was partially fixed in a25c2b7c4db3. This fix, backpatched all the way back to 9.3, goes approximately in the same direction as a25c2b7c4db3 but should cover all cases. Bug analysis by Andrew Gierth and Álvaro Herrera. A number of public reports match this bug: https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-25 00:29:28 +02:00
if (HEAP_LOCKED_UPGRADED(infomask))
return false;
nmembers = GetMultiXactIdMembers(multi, &members, false,
HEAP_XMAX_IS_LOCKED_ONLY(infomask));
if (nmembers >= 0)
{
int i;
for (i = 0; i < nmembers; i++)
{
TransactionId memxid;
LOCKMODE memlockmode;
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
if (result && (current_is_member == NULL || *current_is_member))
break;
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
memlockmode = LOCKMODE_from_mxstatus(members[i].status);
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
/* ignore members from current xact (but track their presence) */
memxid = members[i].xid;
if (TransactionIdIsCurrentTransactionId(memxid))
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
{
if (current_is_member != NULL)
*current_is_member = true;
continue;
}
else if (result)
continue;
/* ignore members that don't conflict with the lock we want */
if (!DoLockModesConflict(memlockmode, wanted))
continue;
if (ISUPDATE_from_mxstatus(members[i].status))
{
/* ignore aborted updaters */
if (TransactionIdDidAbort(memxid))
continue;
}
else
{
/* ignore lockers-only that are no longer in progress */
if (!TransactionIdIsInProgress(memxid))
continue;
}
/*
* Whatever remains are either live lockers that conflict with our
* wanted lock, and updaters that are not aborted. Those conflict
Avoid spurious deadlocks when upgrading a tuple lock This puts back reverted commit de87a084c0a5, with some bug fixes. When two (or more) transactions are waiting for transaction T1 to release a tuple-level lock, and transaction T1 upgrades its lock to a higher level, a spurious deadlock can be reported among the waiting transactions when T1 finishes. The simplest example case seems to be: T1: select id from job where name = 'a' for key share; Y: select id from job where name = 'a' for update; -- starts waiting for T1 Z: select id from job where name = 'a' for key share; T1: update job set name = 'b' where id = 1; Z: update job set name = 'c' where id = 1; -- starts waiting for T1 T1: rollback; At this point, transaction Y is rolled back on account of a deadlock: Y holds the heavyweight tuple lock and is waiting for the Xmax to be released, while Z holds part of the multixact and tries to acquire the heavyweight lock (per protocol) and goes to sleep; once T1 releases its part of the multixact, Z is awakened only to be put back to sleep on the heavyweight lock that Y is holding while sleeping. Kaboom. This can be avoided by having Z skip the heavyweight lock acquisition. As far as I can see, the biggest downside is that if there are multiple Z transactions, the order in which they resume after T1 finishes is not guaranteed. Backpatch to 9.6. The patch applies cleanly on 9.5, but the new tests don't work there (because isolationtester is not smart enough), so I'm not going to risk it. Author: Oleksii Kliukin Discussion: https://postgr.es/m/B9C9D7CD-EB94-4635-91B6-E558ACEC0EC3@hintbits.com Discussion: https://postgr.es/m/2815.1560521451@sss.pgh.pa.us
2019-06-19 00:23:16 +02:00
* with what we want. Set up to return true, but keep going to
* look for the current transaction among the multixact members,
* if needed.
*/
result = true;
}
pfree(members);
}
return result;
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Do_MultiXactIdWait
* Actual implementation for the two functions below.
*
* 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
* needed to ensure we only sleep on conflicting members, and the infomask is
* used to optimize multixact access in case it's a lock-only multi); 'nowait'
* indicates whether to use conditional lock acquisition, to allow callers to
* fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
* context information for error messages. 'remaining', if not NULL, receives
* the number of members that are still running, including any (non-aborted)
* subtransactions of our own transaction.
*
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
* We do this by sleeping on each member using XactLockTableWait. Any
* members that belong to the current backend are *not* waited for, however;
* this would not merely be useless but would lead to Assert failure inside
* XactLockTableWait. By the time this returns, it is certain that all
* transactions *of other backends* that were members of the MultiXactId
* that conflict with the requested status are dead (and no new ones can have
* been added, since it is not legal to add members to an existing
* MultiXactId).
*
* But by the time we finish sleeping, someone else may have changed the Xmax
* of the containing tuple, so the caller needs to iterate on us somehow.
*
* Note that in case we return false, the number of remaining members is
* not to be trusted.
*/
static bool
Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
uint16 infomask, bool nowait,
Relation rel, ItemPointer ctid, XLTW_Oper oper,
int *remaining)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
bool result = true;
MultiXactMember *members;
int nmembers;
int remain = 0;
Fix handling of multixacts predating pg_upgrade After pg_upgrade, it is possible that some tuples' Xmax have multixacts corresponding to the old installation; such multixacts cannot have running members anymore. In many code sites we already know not to read them and clobber them silently, but at least when VACUUM tries to freeze a multixact or determine whether one needs freezing, there's an attempt to resolve it to its member transactions by calling GetMultiXactIdMembers, and if the multixact value is "in the future" with regards to the current valid multixact range, an error like this is raised: ERROR: MultiXactId 123 has not been created yet -- apparent wraparound and vacuuming fails. Per discussion with Andrew Gierth, it is completely bogus to try to resolve multixacts coming from before a pg_upgrade, regardless of where they stand with regards to the current valid multixact range. It's possible to get from under this problem by doing SELECT FOR UPDATE of the problem tuples, but if tables are large, this is slow and tedious, so a more thorough solution is desirable. To fix, we realize that multixacts in xmax created in 9.2 and previous have a specific bit pattern that is never used in 9.3 and later (we already knew this, per comments and infomask tests sprinkled in various places, but we weren't leveraging this knowledge appropriately). Whenever the infomask of the tuple matches that bit pattern, we just ignore the multixact completely as if Xmax wasn't set; or, in the case of tuple freezing, we act as if an unwanted value is set and clobber it without decoding. This guarantees that no errors will be raised, and that the values will be progressively removed until all tables are clean. Most callers of GetMultiXactIdMembers are patched to recognize directly that the value is a removable "empty" multixact and avoid calling GetMultiXactIdMembers altogether. To avoid changing the signature of GetMultiXactIdMembers() in back branches, we keep the "allow_old" boolean flag but rename it to "from_pgupgrade"; if the flag is true, we always return an empty set instead of looking up the multixact. (I suppose we could remove the argument in the master branch, but I chose not to do so in this commit). This was broken all along, but the error-facing message appeared first because of commit 8e9a16ab8f7f and was partially fixed in a25c2b7c4db3. This fix, backpatched all the way back to 9.3, goes approximately in the same direction as a25c2b7c4db3 but should cover all cases. Bug analysis by Andrew Gierth and Álvaro Herrera. A number of public reports match this bug: https://www.postgresql.org/message-id/20140330040029.GY4582@tamriel.snowman.net https://www.postgresql.org/message-id/538F3D70.6080902@publicrelay.com https://www.postgresql.org/message-id/556439CF.7070109@pscs.co.uk https://www.postgresql.org/message-id/SG2PR06MB0760098A111C88E31BD4D96FB3540@SG2PR06MB0760.apcprd06.prod.outlook.com https://www.postgresql.org/message-id/20160615203829.5798.4594@wrigleys.postgresql.org
2016-06-25 00:29:28 +02:00
/* for pre-pg_upgrade tuples, no need to sleep at all */
nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
GetMultiXactIdMembers(multi, &members, false,
HEAP_XMAX_IS_LOCKED_ONLY(infomask));
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (nmembers >= 0)
{
int i;
for (i = 0; i < nmembers; i++)
{
TransactionId memxid = members[i].xid;
MultiXactStatus memstatus = members[i].status;
if (TransactionIdIsCurrentTransactionId(memxid))
{
remain++;
continue;
}
if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
LOCKMODE_from_mxstatus(status)))
{
if (remaining && TransactionIdIsInProgress(memxid))
remain++;
continue;
}
/*
* This member conflicts with our multi, so we have to sleep (or
* return failure, if asked to avoid waiting.)
*
* Note that we don't set up an error context callback ourselves,
* but instead we pass the info down to XactLockTableWait. This
* might seem a bit wasteful because the context is set up and
* tore down for each member of the multixact, but in reality it
* should be barely noticeable, and it avoids duplicate code.
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
*/
if (nowait)
{
result = ConditionalXactLockTableWait(memxid);
if (!result)
break;
}
else
XactLockTableWait(memxid, rel, ctid, oper);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
pfree(members);
}
if (remaining)
*remaining = remain;
return result;
}
/*
* MultiXactIdWait
* Sleep on a MultiXactId.
*
* By the time we finish sleeping, someone else may have changed the Xmax
* of the containing tuple, so the caller needs to iterate on us somehow.
*
* We return (in *remaining, if not NULL) the number of members that are still
* running, including any (non-aborted) subtransactions of our own transaction.
*/
static void
MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
Relation rel, ItemPointer ctid, XLTW_Oper oper,
int *remaining)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
(void) Do_MultiXactIdWait(multi, status, infomask, false,
rel, ctid, oper, remaining);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/*
* ConditionalMultiXactIdWait
* As above, but only lock if we can get the lock without blocking.
*
* By the time we finish sleeping, someone else may have changed the Xmax
* of the containing tuple, so the caller needs to iterate on us somehow.
*
* If the multixact is now all gone, return true. Returns false if some
* transactions might still be running.
*
* We return (in *remaining, if not NULL) the number of members that are still
* running, including any (non-aborted) subtransactions of our own transaction.
*/
static bool
ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
uint16 infomask, Relation rel, int *remaining)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
return Do_MultiXactIdWait(multi, status, infomask, true,
rel, NULL, XLTW_None, remaining);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
/*
* heap_tuple_needs_eventual_freeze
*
* Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
* will eventually require freezing (if tuple isn't removed by pruning first).
*/
bool
heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
{
TransactionId xid;
/*
* If xmin is a normal transaction ID, this tuple is definitely not
* frozen.
*/
xid = HeapTupleHeaderGetXmin(tuple);
if (TransactionIdIsNormal(xid))
return true;
/*
* If xmax is a valid xact or multixact, this tuple is also not frozen.
*/
if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
{
MultiXactId multi;
multi = HeapTupleHeaderGetRawXmax(tuple);
if (MultiXactIdIsValid(multi))
return true;
}
else
{
xid = HeapTupleHeaderGetRawXmax(tuple);
if (TransactionIdIsNormal(xid))
return true;
}
if (tuple->t_infomask & HEAP_MOVED)
{
xid = HeapTupleHeaderGetXvac(tuple);
if (TransactionIdIsNormal(xid))
return true;
}
return false;
}
/*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* heap_tuple_should_freeze
*
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
* Return value indicates if heap_prepare_freeze_tuple sibling function would
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* (or should) force freezing of the heap page that contains caller's tuple.
* Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
* This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
*
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
* The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
* arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
* Our working assumption is that caller won't decide to freeze this tuple.
* It's up to caller to only ratchet back its own top-level trackers after the
* point that it fully commits to not freezing the tuple/page in question.
*/
bool
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
heap_tuple_should_freeze(HeapTupleHeader tuple,
const struct VacuumCutoffs *cutoffs,
TransactionId *NoFreezePageRelfrozenXid,
MultiXactId *NoFreezePageRelminMxid)
{
TransactionId xid;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
MultiXactId multi;
bool freeze = false;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/* First deal with xmin */
xid = HeapTupleHeaderGetXmin(tuple);
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
if (TransactionIdIsNormal(xid))
{
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
*NoFreezePageRelfrozenXid = xid;
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
freeze = true;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
}
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/* Now deal with xmax */
xid = InvalidTransactionId;
multi = InvalidMultiXactId;
if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
multi = HeapTupleHeaderGetRawXmax(tuple);
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
else
xid = HeapTupleHeaderGetRawXmax(tuple);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
if (TransactionIdIsNormal(xid))
{
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/* xmax is a non-permanent XID */
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
*NoFreezePageRelfrozenXid = xid;
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
freeze = true;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
}
else if (!MultiXactIdIsValid(multi))
{
/* xmax is a permanent XID or invalid MultiXactId/XID */
}
else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
{
/* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
*NoFreezePageRelminMxid = multi;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
freeze = true;
}
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
else
{
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/* xmax is a MultiXactId that may have an updater XID */
MultiXactMember *members;
int nmembers;
Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi));
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
*NoFreezePageRelminMxid = multi;
if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
freeze = true;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
/* need to check whether any member of the mxact is old */
nmembers = GetMultiXactIdMembers(multi, &members, false,
HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
for (int i = 0; i < nmembers; i++)
{
xid = members[i].xid;
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
*NoFreezePageRelfrozenXid = xid;
if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
freeze = true;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
}
if (nmembers > 0)
pfree(members);
Fix a couple of bugs in MultiXactId freezing Both heap_freeze_tuple() and heap_tuple_needs_freeze() neglected to look into a multixact to check the members against cutoff_xid. This means that a very old Xid could survive hidden within a multi, possibly outliving its CLOG storage. In the distant future, this would cause clog lookup failures: ERROR: could not access status of transaction 3883960912 DETAIL: Could not open file "pg_clog/0E78": No such file or directory. This mostly was problematic when the updating transaction aborted, since in that case the row wouldn't get pruned away earlier in vacuum and the multixact could possibly survive for a long time. In many cases, data that is inaccessible for this reason way can be brought back heuristically. As a second bug, heap_freeze_tuple() didn't properly handle multixacts that need to be frozen according to cutoff_multi, but whose updater xid is still alive. Instead of preserving the update Xid, it just set Xmax invalid, which leads to both old and new tuple versions becoming visible. This is pretty rare in practice, but a real threat nonetheless. Existing corrupted rows, unfortunately, cannot be repaired in an automated fashion. Existing physical replicas might have already incorrectly frozen tuples because of different behavior than in master, which might only become apparent in the future once pg_multixact/ is truncated; it is recommended that all clones be rebuilt after upgrading. Following code analysis caused by bug report by J Smith in message CADFUPgc5bmtv-yg9znxV-vcfkb+JPRqs7m2OesQXaM_4Z1JpdQ@mail.gmail.com and privately by F-Secure. Backpatch to 9.3, where freezing of MultiXactIds was introduced. Analysis and patch by Andres Freund, with some tweaks by Álvaro.
2013-11-28 23:17:21 +01:00
}
if (tuple->t_infomask & HEAP_MOVED)
{
xid = HeapTupleHeaderGetXvac(tuple);
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
if (TransactionIdIsNormal(xid))
{
Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
Add page-level freezing to VACUUM. Teach VACUUM to decide on whether or not to trigger freezing at the level of whole heap pages. Individual XIDs and MXIDs fields from tuple headers now trigger freezing of whole pages, rather than independently triggering freezing of each individual tuple header field. Managing the cost of freezing over time now significantly influences when and how VACUUM freezes. The overall amount of WAL written is the single most important freezing related cost, in general. Freezing each page's tuples together in batch allows VACUUM to take full advantage of the freeze plan WAL deduplication optimization added by commit 9e540599. Also teach VACUUM to trigger page-level freezing whenever it detects that heap pruning generated an FPI. We'll have already written a large amount of WAL just to do that much, so it's very likely a good idea to get freezing out of the way for the page early. This only happens in cases where it will directly lead to marking the page all-frozen in the visibility map. In most cases "freezing a page" removes all XIDs < OldestXmin, and all MXIDs < OldestMxact. It doesn't quite work that way in certain rare cases involving MultiXacts, though. It is convenient to define "freeze the page" in a way that gives FreezeMultiXactId the leeway to put off the work of processing an individual tuple's xmax whenever it happens to be a MultiXactId that would require an expensive second pass to process aggressively (allocating a new multi is especially worth avoiding here). FreezeMultiXactId is eager when processing is cheap (as it usually is), and lazy in the event of an individual multi that happens to require expensive second pass processing. This avoids regressions related to processing of multis that page-level freezing might otherwise cause. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Jeff Davis <pgsql@j-davis.com> Reviewed-By: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com
2022-12-28 17:50:47 +01:00
if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
*NoFreezePageRelfrozenXid = xid;
/* heap_prepare_freeze_tuple forces xvac freezing */
freeze = true;
Set relfrozenxid to oldest extant XID seen by VACUUM. When VACUUM set relfrozenxid before now, it set it to whatever value was used to determine which tuples to freeze -- the FreezeLimit cutoff. This approach was very naive. The relfrozenxid invariant only requires that new relfrozenxid values be <= the oldest extant XID remaining in the table (at the point that the VACUUM operation ends), which in general might be much more recent than FreezeLimit. VACUUM now carefully tracks the oldest remaining XID/MultiXactId as it goes (the oldest remaining values _after_ lazy_scan_prune processing). The final values are set as the table's new relfrozenxid and new relminmxid in pg_class at the end of each VACUUM. The oldest XID might come from a tuple's xmin, xmax, or xvac fields. It might even come from one of the table's remaining MultiXacts. Final relfrozenxid values must still be >= FreezeLimit in an aggressive VACUUM (FreezeLimit still acts as a lower bound on the final value that aggressive VACUUM can set relfrozenxid to). Since standard VACUUMs still make no guarantees about advancing relfrozenxid, they might as well set relfrozenxid to a value from well before FreezeLimit when the opportunity presents itself. In general standard VACUUMs may now set relfrozenxid to any value > the original relfrozenxid and <= OldestXmin. Credit for the general idea of using the oldest extant XID to set pg_class.relfrozenxid at the end of VACUUM goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Robert Haas <robertmhaas@gmail.com> Discussion: https://postgr.es/m/CAH2-WzkymFbz6D_vL+jmqSn_5q1wsFvFrE+37yLgL_Rkfd6Gzg@mail.gmail.com
2022-04-03 18:57:21 +02:00
}
}
return freeze;
}
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* Maintain snapshotConflictHorizon for caller by ratcheting forward its value
* using any committed XIDs contained in 'tuple', an obsolescent heap tuple
* that caller is in the process of physically removing, e.g. via HOT pruning
* or index deletion.
*
* Caller must initialize its value to InvalidTransactionId, which is
* generally interpreted as "definitely no need for a recovery conflict".
* Final value must reflect all heap tuples that caller will physically remove
* (or remove TID references to) via its ongoing pruning/deletion operation.
* ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
* caller's WAL record) by REDO routine when it replays caller's operation.
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
*/
void
HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple,
TransactionId *snapshotConflictHorizon)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
{
TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
if (tuple->t_infomask & HEAP_MOVED)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
{
if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
*snapshotConflictHorizon = xvac;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
}
/*
* Ignore tuples inserted by an aborted transaction or if the tuple was
* updated/deleted by the inserting transaction.
*
* Look for a committed hint bit, or if no xmin bit is set, check clog.
*/
if (HeapTupleHeaderXminCommitted(tuple) ||
(!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
{
if (xmax != xmin &&
TransactionIdFollows(xmax, *snapshotConflictHorizon))
*snapshotConflictHorizon = xmax;
}
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
}
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#ifdef USE_PREFETCH
/*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* Helper function for heap_index_delete_tuples. Issues prefetch requests for
* prefetch_count buffers. The prefetch_state keeps track of all the buffers
* we can prefetch, and which have already been prefetched; each call to this
* function picks up where the previous call left off.
*
* Note: we expect the deltids array to be sorted in an order that groups TIDs
* by heap block, with all TIDs for each block appearing together in exactly
* one group.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
static void
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
index_delete_prefetch_buffer(Relation rel,
IndexDeletePrefetchState *prefetch_state,
int prefetch_count)
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
{
BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
int count = 0;
int i;
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
int ndeltids = prefetch_state->ndeltids;
TM_IndexDelete *deltids = prefetch_state->deltids;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
for (i = prefetch_state->next_item;
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
i < ndeltids && count < prefetch_count;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
i++)
{
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
ItemPointer htid = &deltids[i].tid;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
if (cur_hblkno == InvalidBlockNumber ||
ItemPointerGetBlockNumber(htid) != cur_hblkno)
{
cur_hblkno = ItemPointerGetBlockNumber(htid);
PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
count++;
}
}
/*
* Save the prefetch position so that next time we can continue from that
* position.
*/
prefetch_state->next_item = i;
prefetch_state->cur_hblkno = cur_hblkno;
}
#endif
/*
* Helper function for heap_index_delete_tuples. Checks for index corruption
* involving an invalid TID in index AM caller's index page.
*
* This is an ideal place for these checks. The index AM must hold a buffer
* lock on the index page containing the TIDs we examine here, so we don't
* have to worry about concurrent VACUUMs at all. We can be sure that the
* index is corrupt when htid points directly to an LP_UNUSED item or
* heap-only tuple, which is not the case during standard index scans.
*/
static inline void
index_delete_check_htid(TM_IndexDeleteOp *delstate,
Page page, OffsetNumber maxoff,
ItemPointer htid, TM_IndexStatus *istatus)
{
OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid);
ItemId iid;
Assert(OffsetNumberIsValid(istatus->idxoffnum));
if (unlikely(indexpagehoffnum > maxoff))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
ItemPointerGetBlockNumber(htid),
indexpagehoffnum,
istatus->idxoffnum, delstate->iblknum,
RelationGetRelationName(delstate->irel))));
iid = PageGetItemId(page, indexpagehoffnum);
if (unlikely(!ItemIdIsUsed(iid)))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
ItemPointerGetBlockNumber(htid),
indexpagehoffnum,
istatus->idxoffnum, delstate->iblknum,
RelationGetRelationName(delstate->irel))));
if (ItemIdHasStorage(iid))
{
HeapTupleHeader htup;
Assert(ItemIdIsNormal(iid));
htup = (HeapTupleHeader) PageGetItem(page, iid);
if (unlikely(HeapTupleHeaderIsHeapOnly(htup)))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
ItemPointerGetBlockNumber(htid),
indexpagehoffnum,
istatus->idxoffnum, delstate->iblknum,
RelationGetRelationName(delstate->irel))));
}
}
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* heapam implementation of tableam's index_delete_tuples interface.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* This helper function is called by index AMs during index tuple deletion.
* See tableam header comments for an explanation of the interface implemented
* here and a general theory of operation. Note that each call here is either
* a simple index deletion call, or a bottom-up index deletion call.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*
* It's possible for this to generate a fair amount of I/O, since we may be
* deleting hundreds of tuples from a single index block. To amortize that
* cost to some degree, this uses prefetching and combines repeat accesses to
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* the same heap block.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
TransactionId
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
{
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
/* Initial assumption is that earlier pruning took care of conflict */
TransactionId snapshotConflictHorizon = InvalidTransactionId;
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
BlockNumber blkno = InvalidBlockNumber;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
Buffer buf = InvalidBuffer;
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
Page page = NULL;
OffsetNumber maxoff = InvalidOffsetNumber;
TransactionId priorXmax;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#ifdef USE_PREFETCH
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
IndexDeletePrefetchState prefetch_state;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
int prefetch_distance;
#endif
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
SnapshotData SnapshotNonVacuumable;
int finalndeltids = 0,
nblocksaccessed = 0;
/* State that's only used in bottom-up index deletion case */
int nblocksfavorable = 0;
int curtargetfreespace = delstate->bottomupfreespace,
lastfreespace = 0,
actualfreespace = 0;
bool bottomup_final_block = false;
InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel));
/* Sort caller's deltids array by TID for further processing */
index_delete_sort(delstate);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* Bottom-up case: resort deltids array in an order attuned to where the
* greatest number of promising TIDs are to be found, and determine how
* many blocks from the start of sorted array should be considered
* favorable. This will also shrink the deltids array in order to
* eliminate completely unfavorable blocks up front.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
if (delstate->bottomup)
nblocksfavorable = bottomup_sort_and_shrink(delstate);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#ifdef USE_PREFETCH
/* Initialize prefetch state. */
prefetch_state.cur_hblkno = InvalidBlockNumber;
prefetch_state.next_item = 0;
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
prefetch_state.ndeltids = delstate->ndeltids;
prefetch_state.deltids = delstate->deltids;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* Determine the prefetch distance that we will attempt to maintain.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*
* Since the caller holds a buffer lock somewhere in rel, we'd better make
* sure that isn't a catalog relation before we call code that does
* syscache lookups, to avoid risk of deadlock.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
if (IsCatalogRelation(rel))
prefetch_distance = maintenance_io_concurrency;
else
prefetch_distance =
get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
/* Cap initial prefetch distance for bottom-up deletion caller */
if (delstate->bottomup)
{
Assert(nblocksfavorable >= 1);
Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
prefetch_distance = Min(prefetch_distance, nblocksfavorable);
}
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/* Start prefetching. */
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#endif
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
/* Iterate over deltids, determine which to delete, check their horizon */
Assert(delstate->ndeltids > 0);
for (int i = 0; i < delstate->ndeltids; i++)
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
{
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
TM_IndexDelete *ideltid = &delstate->deltids[i];
TM_IndexStatus *istatus = delstate->status + ideltid->id;
ItemPointer htid = &ideltid->tid;
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
OffsetNumber offnum;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* Read buffer, and perform required extra steps each time a new block
* is encountered. Avoid refetching if it's the same block as the one
* from the last htid.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
if (blkno == InvalidBlockNumber ||
ItemPointerGetBlockNumber(htid) != blkno)
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
{
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
/*
* Consider giving up early for bottom-up index deletion caller
* first. (Only prefetch next-next block afterwards, when it
* becomes clear that we're at least going to access the next
* block in line.)
*
* Sometimes the first block frees so much space for bottom-up
* caller that the deletion process can end without accessing any
* more blocks. It is usually necessary to access 2 or 3 blocks
* per bottom-up deletion operation, though.
*/
if (delstate->bottomup)
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
{
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
/*
* We often allow caller to delete a few additional items
* whose entries we reached after the point that space target
* from caller was satisfied. The cost of accessing the page
* was already paid at that point, so it made sense to finish
* it off. When that happened, we finalize everything here
* (by finishing off the whole bottom-up deletion operation
* without needlessly paying the cost of accessing any more
* blocks).
*/
if (bottomup_final_block)
break;
/*
* Give up when we didn't enable our caller to free any
* additional space as a result of processing the page that we
* just finished up with. This rule is the main way in which
* we keep the cost of bottom-up deletion under control.
*/
if (nblocksaccessed >= 1 && actualfreespace == lastfreespace)
break;
lastfreespace = actualfreespace; /* for next time */
/*
* Deletion operation (which is bottom-up) will definitely
* access the next block in line. Prepare for that now.
*
* Decay target free space so that we don't hang on for too
* long with a marginal case. (Space target is only truly
* helpful when it allows us to recognize that we don't need
* to access more than 1 or 2 blocks to satisfy caller due to
* agreeable workload characteristics.)
*
* We are a bit more patient when we encounter contiguous
* blocks, though: these are treated as favorable blocks. The
* decay process is only applied when the next block in line
* is not a favorable/contiguous block. This is not an
* exception to the general rule; we still insist on finding
* at least one deletable item per block accessed. See
* bottomup_nblocksfavorable() for full details of the theory
* behind favorable blocks and heap block locality in general.
*
* Note: The first block in line is always treated as a
* favorable block, so the earliest possible point that the
* decay can be applied is just before we access the second
* block in line. The Assert() verifies this for us.
*/
Assert(nblocksaccessed > 0 || nblocksfavorable > 0);
if (nblocksfavorable > 0)
nblocksfavorable--;
else
curtargetfreespace /= 2;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
}
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
/* release old buffer */
if (BufferIsValid(buf))
UnlockReleaseBuffer(buf);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
blkno = ItemPointerGetBlockNumber(htid);
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
buf = ReadBuffer(rel, blkno);
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
nblocksaccessed++;
Assert(!delstate->bottomup ||
nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#ifdef USE_PREFETCH
/*
* To maintain the prefetch distance, prefetch one more page for
* each page we read.
*/
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
index_delete_prefetch_buffer(rel, &prefetch_state, 1);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
#endif
LockBuffer(buf, BUFFER_LOCK_SHARE);
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
}
/*
* In passing, detect index corruption involving an index page with a
* TID that points to a location in the heap that couldn't possibly be
* correct. We only do this with actual TIDs from caller's index page
* (not items reached by traversing through a HOT chain).
*/
index_delete_check_htid(delstate, page, maxoff, htid, istatus);
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
if (istatus->knowndeletable)
Assert(!delstate->bottomup && !istatus->promising);
else
{
ItemPointerData tmp = *htid;
HeapTupleData heapTuple;
/* Are any tuples from this HOT chain non-vacuumable? */
if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
&heapTuple, NULL, true))
continue; /* can't delete entry */
/* Caller will delete, since whole HOT chain is vacuumable */
istatus->knowndeletable = true;
/* Maintain index free space info for bottom-up deletion case */
if (delstate->bottomup)
{
Assert(istatus->freespace > 0);
actualfreespace += istatus->freespace;
if (actualfreespace >= curtargetfreespace)
bottomup_final_block = true;
}
}
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/*
* Maintain snapshotConflictHorizon value for deletion operation as a
* whole by advancing current value using heap tuple headers. This is
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
* loosely based on the logic for pruning a HOT chain.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
offnum = ItemPointerGetOffsetNumber(htid);
priorXmax = InvalidTransactionId; /* cannot check first XMIN */
for (;;)
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
{
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
ItemId lp;
HeapTupleHeader htup;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/* Sanity check (pure paranoia) */
if (offnum < FirstOffsetNumber)
break;
/*
* An offset past the end of page's line pointer array is possible
* when the array was truncated
*/
if (offnum > maxoff)
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
break;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
lp = PageGetItemId(page, offnum);
if (ItemIdIsRedirected(lp))
{
offnum = ItemIdGetRedirect(lp);
continue;
}
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
/*
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* We'll often encounter LP_DEAD line pointers (especially with an
* entry marked knowndeletable by our caller up front). No heap
* tuple headers get examined for an htid that leads us to an
* LP_DEAD item. This is okay because the earlier pruning
* operation that made the line pointer LP_DEAD in the first place
* must have considered the original tuple header as part of
* generating its own snapshotConflictHorizon value.
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
*
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
* Relying on XLOG_HEAP2_PRUNE records like this is the same
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
* strategy that index vacuuming uses in all cases. Index VACUUM
* WAL records don't even have a snapshotConflictHorizon field of
* their own for this reason.
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
*/
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
if (!ItemIdIsNormal(lp))
break;
htup = (HeapTupleHeader) PageGetItem(page, lp);
/*
* Check the tuple XMIN against prior XMAX, if any
*/
if (TransactionIdIsValid(priorXmax) &&
!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
break;
HeapTupleHeaderAdvanceConflictHorizon(htup,
&snapshotConflictHorizon);
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
Fix index deletion latestRemovedXid bug. The logic for determining the latest removed XID for the purposes of generating recovery conflicts in REDO routines was subtly broken. It failed to follow links from HOT chains, and so failed to consider all relevant heap tuple headers in some cases. To fix, expand the loop that deals with LP_REDIRECT line pointers to also deal with HOT chains. The new version of the loop is loosely based on a similar loop from heap_prune_chain(). The impact of this bug is probably quite limited, since the horizon code necessarily deals with heap tuples that are pointed to by LP_DEAD-set index tuples. The process of setting LP_DEAD index tuples (e.g. within the kill_prior_tuple mechanism) is highly correlated with opportunistic pruning of pointed-to heap tuples. Plus the question of generating a recovery conflict usually comes up some time after index tuple LP_DEAD bits were initially set, unlike heap pruning, where a latestRemovedXid is generated at the point of the pruning operation (heap pruning has no deferred "would-be page split" style processing that produces conflicts lazily). Only backpatch to Postgres 12, the first version where this logic runs during original execution (following commit 558a9165e08). The index latestRemovedXid mechanism has had the same bug since it first appeared over 10 years ago (in commit a760893d), but backpatching to all supported versions now seems like a bad idea on balance. Running the new improved code during recovery seems risky, especially given the lack of complaints from the field. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com Backpatch: 12-
2020-12-31 01:29:05 +01:00
/*
* If the tuple is not HOT-updated, then we are at the end of this
* HOT-chain. No need to visit later tuples from the same update
* chain (they get their own index entries) -- just move on to
* next htid from index AM caller.
*/
if (!HeapTupleHeaderIsHotUpdated(htup))
break;
/* Advance to next HOT chain member */
Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
priorXmax = HeapTupleHeaderGetUpdateXid(htup);
}
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
/* Enable further/final shrinking of deltids for caller */
finalndeltids = i + 1;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
}
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
UnlockReleaseBuffer(buf);
/*
* Shrink deltids array to exclude non-deletable entries at the end. This
* is not just a minor optimization. Final deltids array size might be
* zero for a bottom-up caller. Index AM is explicitly allowed to rely on
* ndeltids being zero in all cases with zero total deletable entries.
*/
Assert(finalndeltids > 0 || delstate->bottomup);
delstate->ndeltids = finalndeltids;
return snapshotConflictHorizon;
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
}
/*
* Specialized inlineable comparison function for index_delete_sort()
*/
static inline int
index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
{
ItemPointer tid1 = &deltid1->tid;
ItemPointer tid2 = &deltid2->tid;
{
BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
if (blk1 != blk2)
return (blk1 < blk2) ? -1 : 1;
}
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
{
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
if (pos1 != pos2)
return (pos1 < pos2) ? -1 : 1;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
}
Assert(false);
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
return 0;
}
/*
* Sort deltids array from delstate by TID. This prepares it for further
* processing by heap_index_delete_tuples().
*
* This operation becomes a noticeable consumer of CPU cycles with some
* workloads, so we go to the trouble of specialization/micro optimization.
* We use shellsort for this because it's easy to specialize, compiles to
* relatively few instructions, and is adaptive to presorted inputs/subsets
* (which are typical here).
*/
static void
index_delete_sort(TM_IndexDeleteOp *delstate)
{
TM_IndexDelete *deltids = delstate->deltids;
int ndeltids = delstate->ndeltids;
int low = 0;
/*
* Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
*
* This implementation is fast with array sizes up to ~4500. This covers
* all supported BLCKSZ values.
*/
const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
/* Think carefully before changing anything here -- keep swaps cheap */
StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
"element size exceeds 8 bytes");
for (int g = 0; g < lengthof(gaps); g++)
{
for (int hi = gaps[g], i = low + hi; i < ndeltids; i++)
{
TM_IndexDelete d = deltids[i];
int j = i;
while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
{
deltids[j] = deltids[j - hi];
j -= hi;
}
deltids[j] = d;
}
}
}
/*
* Returns how many blocks should be considered favorable/contiguous for a
* bottom-up index deletion pass. This is a number of heap blocks that starts
* from and includes the first block in line.
*
* There is always at least one favorable block during bottom-up index
* deletion. In the worst case (i.e. with totally random heap blocks) the
* first block in line (the only favorable block) can be thought of as a
* degenerate array of contiguous blocks that consists of a single block.
* heap_index_delete_tuples() will expect this.
*
* Caller passes blockgroups, a description of the final order that deltids
* will be sorted in for heap_index_delete_tuples() bottom-up index deletion
* processing. Note that deltids need not actually be sorted just yet (caller
* only passes deltids to us so that we can interpret blockgroups).
*
* You might guess that the existence of contiguous blocks cannot matter much,
* since in general the main factor that determines which blocks we visit is
* the number of promising TIDs, which is a fixed hint from the index AM.
* We're not really targeting the general case, though -- the actual goal is
* to adapt our behavior to a wide variety of naturally occurring conditions.
* The effects of most of the heuristics we apply are only noticeable in the
* aggregate, over time and across many _related_ bottom-up index deletion
* passes.
*
* Deeming certain blocks favorable allows heapam to recognize and adapt to
* workloads where heap blocks visited during bottom-up index deletion can be
* accessed contiguously, in the sense that each newly visited block is the
* neighbor of the block that bottom-up deletion just finished processing (or
* close enough to it). It will likely be cheaper to access more favorable
* blocks sooner rather than later (e.g. in this pass, not across a series of
* related bottom-up passes). Either way it is probably only a matter of time
* (or a matter of further correlated version churn) before all blocks that
* appear together as a single large batch of favorable blocks get accessed by
* _some_ bottom-up pass. Large batches of favorable blocks tend to either
* appear almost constantly or not even once (it all depends on per-index
* workload characteristics).
*
* Note that the blockgroups sort order applies a power-of-two bucketing
* scheme that creates opportunities for contiguous groups of blocks to get
* batched together, at least with workloads that are naturally amenable to
* being driven by heap block locality. This doesn't just enhance the spatial
* locality of bottom-up heap block processing in the obvious way. It also
* enables temporal locality of access, since sorting by heap block number
* naturally tends to make the bottom-up processing order deterministic.
*
* Consider the following example to get a sense of how temporal locality
* might matter: There is a heap relation with several indexes, each of which
* is low to medium cardinality. It is subject to constant non-HOT updates.
* The updates are skewed (in one part of the primary key, perhaps). None of
* the indexes are logically modified by the UPDATE statements (if they were
* then bottom-up index deletion would not be triggered in the first place).
* Naturally, each new round of index tuples (for each heap tuple that gets a
* heap_update() call) will have the same heap TID in each and every index.
* Since these indexes are low cardinality and never get logically modified,
* heapam processing during bottom-up deletion passes will access heap blocks
* in approximately sequential order. Temporal locality of access occurs due
* to bottom-up deletion passes behaving very similarly across each of the
* indexes at any given moment. This keeps the number of buffer misses needed
* to visit heap blocks to a minimum.
*/
static int
bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
TM_IndexDelete *deltids)
{
int64 lastblock = -1;
int nblocksfavorable = 0;
Assert(nblockgroups >= 1);
Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
/*
* We tolerate heap blocks that will be accessed only slightly out of
* physical order. Small blips occur when a pair of almost-contiguous
* blocks happen to fall into different buckets (perhaps due only to a
* small difference in npromisingtids that the bucketing scheme didn't
* quite manage to ignore). We effectively ignore these blips by applying
* a small tolerance. The precise tolerance we use is a little arbitrary,
* but it works well enough in practice.
*/
for (int b = 0; b < nblockgroups; b++)
{
IndexDeleteCounts *group = blockgroups + b;
TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
if (lastblock != -1 &&
((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
(int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
break;
nblocksfavorable++;
lastblock = block;
}
/* Always indicate that there is at least 1 favorable block */
Assert(nblocksfavorable >= 1);
return nblocksfavorable;
}
/*
* qsort comparison function for bottomup_sort_and_shrink()
*/
static int
bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
{
const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
/*
* Most significant field is npromisingtids (which we invert the order of
* so as to sort in desc order).
*
* Caller should have already normalized npromisingtids fields into
* power-of-two values (buckets).
*/
if (group1->npromisingtids > group2->npromisingtids)
return -1;
if (group1->npromisingtids < group2->npromisingtids)
return 1;
/*
* Tiebreak: desc ntids sort order.
*
* We cannot expect power-of-two values for ntids fields. We should
* behave as if they were already rounded up for us instead.
*/
if (group1->ntids != group2->ntids)
{
uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids);
uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids);
if (ntids1 > ntids2)
return -1;
if (ntids1 < ntids2)
return 1;
}
/*
* Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
* block in deltids array) order.
*
* This is equivalent to sorting in ascending heap block number order
* (among otherwise equal subsets of the array). This approach allows us
* to avoid accessing the out-of-line TID. (We rely on the assumption
* that the deltids array was sorted in ascending heap TID order when
* these offsets to the first TID from each heap block group were formed.)
*/
if (group1->ifirsttid > group2->ifirsttid)
return 1;
if (group1->ifirsttid < group2->ifirsttid)
return -1;
pg_unreachable();
return 0;
}
/*
* heap_index_delete_tuples() helper function for bottom-up deletion callers.
*
* Sorts deltids array in the order needed for useful processing by bottom-up
* deletion. The array should already be sorted in TID order when we're
* called. The sort process groups heap TIDs from deltids into heap block
* groupings. Earlier/more-promising groups/blocks are usually those that are
* known to have the most "promising" TIDs.
*
* Sets new size of deltids array (ndeltids) in state. deltids will only have
* TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
* return. This often means that deltids will be shrunk to a small fraction
* of its original size (we eliminate many heap blocks from consideration for
* caller up front).
*
* Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
* for a definition and full details.
*/
static int
bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
{
IndexDeleteCounts *blockgroups;
TM_IndexDelete *reordereddeltids;
BlockNumber curblock = InvalidBlockNumber;
int nblockgroups = 0;
int ncopied = 0;
int nblocksfavorable = 0;
Assert(delstate->bottomup);
Assert(delstate->ndeltids > 0);
/* Calculate per-heap-block count of TIDs */
blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids);
for (int i = 0; i < delstate->ndeltids; i++)
{
TM_IndexDelete *ideltid = &delstate->deltids[i];
TM_IndexStatus *istatus = delstate->status + ideltid->id;
ItemPointer htid = &ideltid->tid;
bool promising = istatus->promising;
if (curblock != ItemPointerGetBlockNumber(htid))
{
/* New block group */
nblockgroups++;
Assert(curblock < ItemPointerGetBlockNumber(htid) ||
!BlockNumberIsValid(curblock));
curblock = ItemPointerGetBlockNumber(htid);
blockgroups[nblockgroups - 1].ifirsttid = i;
blockgroups[nblockgroups - 1].ntids = 1;
blockgroups[nblockgroups - 1].npromisingtids = 0;
}
else
{
blockgroups[nblockgroups - 1].ntids++;
}
if (promising)
blockgroups[nblockgroups - 1].npromisingtids++;
}
/*
* We're about ready to sort block groups to determine the optimal order
* for visiting heap blocks. But before we do, round the number of
* promising tuples for each block group up to the next power-of-two,
* unless it is very low (less than 4), in which case we round up to 4.
* npromisingtids is far too noisy to trust when choosing between a pair
* of block groups that both have very low values.
Enhance nbtree index tuple deletion. Teach nbtree and heapam to cooperate in order to eagerly remove duplicate tuples representing dead MVCC versions. This is "bottom-up deletion". Each bottom-up deletion pass is triggered lazily in response to a flood of versions on an nbtree leaf page. This usually involves a "logically unchanged index" hint (these are produced by the executor mechanism added by commit 9dc718bd). The immediate goal of bottom-up index deletion is to avoid "unnecessary" page splits caused entirely by version duplicates. It naturally has an even more useful effect, though: it acts as a backstop against accumulating an excessive number of index tuple versions for any given _logical row_. Bottom-up index deletion complements what we might now call "top-down index deletion": index vacuuming performed by VACUUM. Bottom-up index deletion responds to the immediate local needs of queries, while leaving it up to autovacuum to perform infrequent clean sweeps of the index. The overall effect is to avoid certain pathological performance issues related to "version churn" from UPDATEs. The previous tableam interface used by index AMs to perform tuple deletion (the table_compute_xid_horizon_for_tuples() function) has been replaced with a new interface that supports certain new requirements. Many (perhaps all) of the capabilities added to nbtree by this commit could also be extended to other index AMs. That is left as work for a later commit. Extend deletion of LP_DEAD-marked index tuples in nbtree by adding logic to consider extra index tuples (that are not LP_DEAD-marked) for deletion in passing. This increases the number of index tuples deleted significantly in many cases. The LP_DEAD deletion process (which is now called "simple deletion" to clearly distinguish it from bottom-up deletion) won't usually need to visit any extra table blocks to check these extra tuples. We have to visit the same table blocks anyway to generate a latestRemovedXid value (at least in the common case where the index deletion operation's WAL record needs such a value). Testing has shown that the "extra tuples" simple deletion enhancement increases the number of index tuples deleted with almost any workload that has LP_DEAD bits set in leaf pages. That is, it almost never fails to delete at least a few extra index tuples. It helps most of all in cases that happen to naturally have a lot of delete-safe tuples. It's not uncommon for an individual deletion operation to end up deleting an order of magnitude more index tuples compared to the old naive approach (e.g., custom instrumentation of the patch shows that this happens fairly often when the regression tests are run). Add a further enhancement that augments simple deletion and bottom-up deletion in indexes that make use of deduplication: Teach nbtree's _bt_delitems_delete() function to support granular TID deletion in posting list tuples. It is now possible to delete individual TIDs from posting list tuples provided the TIDs have a tableam block number of a table block that gets visited as part of the deletion process (visiting the table block can be triggered directly or indirectly). Setting the LP_DEAD bit of a posting list tuple is still an all-or-nothing thing, but that matters much less now that deletion only needs to start out with the right _general_ idea about which index tuples are deletable. Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. No bump in BTREE_VERSION, since there are no changes to the on-disk representation of nbtree indexes. Indexes built on PostgreSQL 12 or PostgreSQL 13 will automatically benefit from bottom-up index deletion (i.e. no reindexing required) following a pg_upgrade. The enhancement to simple deletion is available with all B-Tree indexes following a pg_upgrade, no matter what PostgreSQL version the user upgrades from. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-By: Victor Yegorov <vyegorov@gmail.com> Discussion: https://postgr.es/m/CAH2-Wzm+maE3apHB8NOtmM=p-DO65j2V5GzAWCOEEuy3JZgb2g@mail.gmail.com
2021-01-13 18:21:32 +01:00
*
* This scheme divides heap blocks/block groups into buckets. Each bucket
* contains blocks that have _approximately_ the same number of promising
* TIDs as each other. The goal is to ignore relatively small differences
* in the total number of promising entries, so that the whole process can
* give a little weight to heapam factors (like heap block locality)
* instead. This isn't a trade-off, really -- we have nothing to lose. It
* would be foolish to interpret small differences in npromisingtids
* values as anything more than noise.
*
* We tiebreak on nhtids when sorting block group subsets that have the
* same npromisingtids, but this has the same issues as npromisingtids,
* and so nhtids is subject to the same power-of-two bucketing scheme. The
* only reason that we don't fix nhtids in the same way here too is that
* we'll need accurate nhtids values after the sort. We handle nhtids
* bucketization dynamically instead (in the sort comparator).
*
* See bottomup_nblocksfavorable() for a full explanation of when and how
* heap locality/favorable blocks can significantly influence when and how
* heap blocks are accessed.
*/
for (int b = 0; b < nblockgroups; b++)
{
IndexDeleteCounts *group = blockgroups + b;
/* Better off falling back on nhtids with low npromisingtids */
if (group->npromisingtids <= 4)
group->npromisingtids = 4;
else
group->npromisingtids =
pg_nextpower2_32((uint32) group->npromisingtids);
}
/* Sort groups and rearrange caller's deltids array */
qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
bottomup_sort_and_shrink_cmp);
reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups);
/* Determine number of favorable blocks at the start of final deltids */
nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups,
delstate->deltids);
for (int b = 0; b < nblockgroups; b++)
{
IndexDeleteCounts *group = blockgroups + b;
TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
memcpy(reordereddeltids + ncopied, firstdtid,
sizeof(TM_IndexDelete) * group->ntids);
ncopied += group->ntids;
}
/* Copy final grouped and sorted TIDs back into start of caller's array */
memcpy(delstate->deltids, reordereddeltids,
sizeof(TM_IndexDelete) * ncopied);
delstate->ndeltids = ncopied;
pfree(reordereddeltids);
pfree(blockgroups);
return nblocksfavorable;
Compute XID horizon for page level index vacuum on primary. Previously the xid horizon was only computed during WAL replay. That had two major problems: 1) It relied on knowing what the table pointed to looks like. That was easy enough before the introducing of tableam (we knew it had to be heap, although some trickery around logging the heap relfilenodes was required). But to properly handle table AMs we need per-database catalog access to look up the AM handler, which recovery doesn't allow. 2) Not knowing the xid horizon also makes it hard to support logical decoding on standbys. When on a catalog table, we need to be able to conflict with slots that have an xid horizon that's too old. But computing the horizon by visiting the heap only works once consistency is reached, but we always need to be able to detect conflicts. There's also a secondary problem, in that the current method performs redundant work on every standby. But that's counterbalanced by potentially computing the value when not necessary (either because there's no standby, or because there's no connected backends). Solve 1) and 2) by moving computation of the xid horizon to the primary and by involving tableam in the computation of the horizon. To address the potentially increased overhead, increase the efficiency of the xid horizon computation for heap by sorting the tids, and eliminating redundant buffer accesses. When prefetching is available, additionally perform prefetching of buffers. As this is more of a maintenance task, rather than something routinely done in every read only query, we add an arbitrary 10 to the effective concurrency - thereby using IO concurrency, when not globally enabled. That's possibly not the perfect formula, but seems good enough for now. Bumps WAL format, as latestRemovedXid is now part of the records, and the heap's relfilenode isn't anymore. Author: Andres Freund, Amit Khandekar, Robert Haas Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20181212204154.nsxf3gzqv3gesl32@alap3.anarazel.de https://postgr.es/m/20181214014235.dal5ogljs3bmlq44@alap3.anarazel.de https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
2019-03-26 22:41:46 +01:00
}
/*
* Perform XLogInsert for a heap-visible operation. 'block' is the block
* being marked all-visible, and vm_buffer is the buffer containing the
* corresponding visibility map block. Both should have already been modified
* and dirtied.
*
* snapshotConflictHorizon comes from the largest xmin on the page being
* marked all-visible. REDO routine uses it to generate recovery conflicts.
*
* If checksums or wal_log_hints are enabled, we may also generate a full-page
* image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
* REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
* update the heap page's LSN.
*/
XLogRecPtr
log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer,
TransactionId snapshotConflictHorizon, uint8 vmflags)
{
xl_heap_visible xlrec;
XLogRecPtr recptr;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
uint8 flags;
Assert(BufferIsValid(heap_buffer));
Assert(BufferIsValid(vm_buffer));
xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
xlrec.flags = vmflags;
Add info in WAL records in preparation for logical slot conflict handling This commit only implements one prerequisite part for allowing logical decoding. The commit message contains an explanation of the overall design, which later commits will refer back to. Overall design: 1. We want to enable logical decoding on standbys, but replay of WAL from the primary might remove data that is needed by logical decoding, causing error(s) on the standby. To prevent those errors, a new replication conflict scenario needs to be addressed (as much as hot standby does). 2. Our chosen strategy for dealing with this type of replication slot is to invalidate logical slots for which needed data has been removed. 3. To do this we need the latestRemovedXid for each change, just as we do for physical replication conflicts, but we also need to know whether any particular change was to data that logical replication might access. That way, during WAL replay, we know when there is a risk of conflict and, if so, if there is a conflict. 4. We can't rely on the standby's relcache entries for this purpose in any way, because the startup process can't access catalog contents. 5. Therefore every WAL record that potentially removes data from the index or heap must carry a flag indicating whether or not it is one that might be accessed during logical decoding. Why do we need this for logical decoding on standby? First, let's forget about logical decoding on standby and recall that on a primary database, any catalog rows that may be needed by a logical decoding replication slot are not removed. This is done thanks to the catalog_xmin associated with the logical replication slot. But, with logical decoding on standby, in the following cases: - hot_standby_feedback is off - hot_standby_feedback is on but there is no a physical slot between the primary and the standby. Then, hot_standby_feedback will work, but only while the connection is alive (for example a node restart would break it) Then, the primary may delete system catalog rows that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). So, it’s mandatory to identify those rows and invalidate the slots that may need them if any. Identifying those rows is the purpose of this commit. Implementation: When a WAL replay on standby indicates that a catalog table tuple is to be deleted by an xid that is greater than a logical slot's catalog_xmin, then that means the slot's catalog_xmin conflicts with the xid, and we need to handle the conflict. While subsequent commits will do the actual conflict handling, this commit adds a new field isCatalogRel in such WAL records (and a new bit set in the xl_heap_visible flags field), that is true for catalog tables, so as to arrange for conflict handling. The affected WAL records are the ones that already contain the snapshotConflictHorizon field, namely: - gistxlogDelete - gistxlogPageReuse - xl_hash_vacuum_one_page - xl_heap_prune - xl_heap_freeze_page - xl_heap_visible - xl_btree_reuse_page - xl_btree_delete - spgxlogVacuumRedirect Due to this new field being added, xl_hash_vacuum_one_page and gistxlogDelete do now contain the offsets to be deleted as a FLEXIBLE_ARRAY_MEMBER. This is needed to ensure correct alignment. It's not needed on the others struct where isCatalogRel has been added. This commit just introduces the WAL format changes mentioned above. Handling the actual conflicts will follow in future commits. Bumps XLOG_PAGE_MAGIC as the several WAL records are changed. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> (in an older version) Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
2023-04-02 21:32:19 +02:00
if (RelationIsAccessibleInLogicalDecoding(rel))
xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBuffer(0, vm_buffer, 0);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
flags = REGBUF_STANDARD;
if (!XLogHintBitIsNeeded())
flags |= REGBUF_NO_IMAGE;
XLogRegisterBuffer(1, heap_buffer, flags);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
return recptr;
}
/*
* Perform XLogInsert for a heap-update operation. Caller must already
* have modified the buffer(s) and marked them dirty.
*/
static XLogRecPtr
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
log_heap_update(Relation reln, Buffer oldbuf,
Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
HeapTuple old_key_tuple,
bool all_visible_cleared, bool new_all_visible_cleared)
{
xl_heap_update xlrec;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xl_heap_header xlhdr;
xl_heap_header xlhdr_idx;
uint8 info;
uint16 prefix_suffix[2];
uint16 prefixlen = 0,
suffixlen = 0;
XLogRecPtr recptr;
Page page = BufferGetPage(newbuf);
bool need_tuple_data = RelationIsLogicallyLogged(reln);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
bool init;
int bufflags;
/* Caller should not call me on a non-WAL-logged relation */
Assert(RelationNeedsWAL(reln));
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogBeginInsert();
if (HeapTupleIsHeapOnly(newtup))
info = XLOG_HEAP_HOT_UPDATE;
else
info = XLOG_HEAP_UPDATE;
/*
* If the old and new tuple are on the same page, we only need to log the
* parts of the new tuple that were changed. That saves on the amount of
* WAL we need to write. Currently, we just count any unchanged bytes in
* the beginning and end of the tuple. That's quick to check, and
* perfectly covers the common case that only one field is updated.
*
* We could do this even if the old and new tuple are on different pages,
* but only if we don't make a full-page image of the old page, which is
* difficult to know in advance. Also, if the old tuple is corrupt for
* some reason, it would allow the corruption to propagate the new page,
* so it seems best to avoid. Under the general assumption that most
* updates tend to create the new tuple version on the same page, there
* isn't much to be gained by doing this across pages anyway.
*
* Skip this if we're taking a full-page image of the new page, as we
* don't include the new tuple in the WAL record in that case. Also
* disable if wal_level='logical', as logical decoding needs to be able to
* read the new tuple in whole from the WAL record alone.
*/
if (oldbuf == newbuf && !need_tuple_data &&
!XLogCheckBufferNeedsBackup(newbuf))
{
char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
int newlen = newtup->t_len - newtup->t_data->t_hoff;
/* Check for common prefix between old and new tuple */
for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
{
if (newp[prefixlen] != oldp[prefixlen])
break;
}
/*
* Storing the length of the prefix takes 2 bytes, so we need to save
* at least 3 bytes or there's no point.
*/
if (prefixlen < 3)
prefixlen = 0;
/* Same for suffix */
for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
{
if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
break;
}
if (suffixlen < 3)
suffixlen = 0;
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* Prepare main WAL data chain */
xlrec.flags = 0;
if (all_visible_cleared)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
if (new_all_visible_cleared)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
if (prefixlen > 0)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
if (suffixlen > 0)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (need_tuple_data)
{
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (old_key_tuple)
{
if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
else
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
}
}
/* If new tuple is the single and first tuple on page... */
if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
{
info |= XLOG_HEAP_INIT_PAGE;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
init = true;
}
else
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
init = false;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* Prepare WAL data for the old page */
xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
oldtup->t_data->t_infomask2);
/* Prepare WAL data for the new page */
xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
bufflags = REGBUF_STANDARD;
if (init)
bufflags |= REGBUF_WILL_INIT;
if (need_tuple_data)
bufflags |= REGBUF_KEEP_DATA;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBuffer(0, newbuf, bufflags);
if (oldbuf != newbuf)
XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
/*
* Prepare WAL data for the new tuple.
*/
if (prefixlen > 0 || suffixlen > 0)
{
if (prefixlen > 0 && suffixlen > 0)
{
prefix_suffix[0] = prefixlen;
prefix_suffix[1] = suffixlen;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
}
else if (prefixlen > 0)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
}
else
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
}
}
2003-08-04 02:43:34 +02:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
xlhdr.t_infomask = newtup->t_data->t_infomask;
xlhdr.t_hoff = newtup->t_data->t_hoff;
Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
/*
* PG73FORMAT: write bitmap [+ padding] [+ oid] + data
*
* The 'data' doesn't include the common prefix or suffix.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
if (prefixlen == 0)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0,
((char *) newtup->t_data) + SizeofHeapTupleHeader,
newtup->t_len - SizeofHeapTupleHeader - suffixlen);
}
else
{
/*
* Have to write the null bitmap and data after the common prefix as
* two separate rdata entries.
*/
/* bitmap [+ padding] [+ oid] */
if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0,
((char *) newtup->t_data) + SizeofHeapTupleHeader,
newtup->t_data->t_hoff - SizeofHeapTupleHeader);
}
/* data after common prefix */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterBufData(0,
((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* We need to log a tuple identity */
if (need_tuple_data && old_key_tuple)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* don't really need this, but its more comfy to decode */
xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
old_key_tuple->t_len - SizeofHeapTupleHeader);
}
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/* filtering by origin on a row level is much more efficient */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP_ID, info);
2000-10-24 11:56:23 +02:00
return recptr;
2000-10-24 11:56:23 +02:00
}
/*
* Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
*
* This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
* tuples.
*/
static XLogRecPtr
log_heap_new_cid(Relation relation, HeapTuple tup)
{
xl_heap_new_cid xlrec;
XLogRecPtr recptr;
HeapTupleHeader hdr = tup->t_data;
Assert(ItemPointerIsValid(&tup->t_self));
Assert(tup->t_tableOid != InvalidOid);
xlrec.top_xid = GetTopTransactionId();
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
xlrec.target_locator = relation->rd_locator;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlrec.target_tid = tup->t_self;
/*
* If the tuple got inserted & deleted in the same TX we definitely have a
* combo CID, set cmin and cmax.
*/
if (hdr->t_infomask & HEAP_COMBOCID)
{
Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
Assert(!HeapTupleHeaderXminInvalid(hdr));
xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
}
/* No combo CID, so only cmin or cmax can be set by this TX */
else
{
/*
* Tuple inserted.
*
* We need to check for LOCK ONLY because multixacts might be
* transferred to the new tuple in case of FOR KEY SHARE updates in
* which case there will be an xmax, although the tuple just got
* inserted.
*/
if (hdr->t_infomask & HEAP_XMAX_INVALID ||
HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
{
xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
xlrec.cmax = InvalidCommandId;
}
/* Tuple from a different tx updated or deleted. */
else
{
xlrec.cmin = InvalidCommandId;
xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
}
xlrec.combocid = InvalidCommandId;
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/*
* Note that we don't need to register the buffer here, because this
* operation does not modify the page. The insert/update/delete that
* called us certainly did, but that's WAL-logged separately.
*/
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
/* will be looked at irrespective of origin */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
return recptr;
}
/*
* Build a heap tuple representing the configured REPLICA IDENTITY to represent
* the old tuple in an UPDATE or DELETE.
*
* Returns NULL if there's no need to log an identity or if there's no suitable
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
* key defined.
*
* Pass key_required true if any replica identity columns changed value, or if
* any of them have any external data. Delete must always pass true.
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
*
* *copy is set to true if the returned tuple is a modified copy rather than
* the same tuple that was passed in.
*/
static HeapTuple
ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
bool *copy)
{
TupleDesc desc = RelationGetDescr(relation);
char replident = relation->rd_rel->relreplident;
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
Bitmapset *idattrs;
HeapTuple key_tuple;
bool nulls[MaxHeapAttributeNumber];
Datum values[MaxHeapAttributeNumber];
*copy = false;
if (!RelationIsLogicallyLogged(relation))
return NULL;
if (replident == REPLICA_IDENTITY_NOTHING)
return NULL;
if (replident == REPLICA_IDENTITY_FULL)
{
/*
* When logging the entire old tuple, it very well could contain
* toasted columns. If so, force them to be inlined.
*/
if (HeapTupleHasExternal(tp))
{
*copy = true;
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
tp = toast_flatten_tuple(tp, desc);
}
return tp;
}
/* if the key isn't required and we're only logging the key, we're done */
if (!key_required)
return NULL;
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
/* find out the replica identity columns */
idattrs = RelationGetIndexAttrBitmap(relation,
INDEX_ATTR_BITMAP_IDENTITY_KEY);
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
/*
* If there's no defined replica identity columns, treat as !key_required.
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
* (This case should not be reachable from heap_update, since that should
* calculate key_required accurately. But heap_delete just passes
* constant true for key_required, so we can hit this case in deletes.)
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
*/
if (bms_is_empty(idattrs))
return NULL;
/*
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
* Construct a new tuple containing only the replica identity columns,
* with nulls elsewhere. While we're at it, assert that the replica
* identity columns aren't null.
*/
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
heap_deform_tuple(tp, desc, values, nulls);
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
for (int i = 0; i < desc->natts; i++)
{
if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
idattrs))
Assert(!nulls[i]);
else
nulls[i] = true;
}
key_tuple = heap_form_tuple(desc, values, nulls);
*copy = true;
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
bms_free(idattrs);
/*
* If the tuple, which by here only contains indexed columns, still has
* toasted columns, force them to be inlined. This is somewhat unlikely
* since there's limits on the size of indexed columns, so we don't
* duplicate toast_flatten_tuple()s functionality in the above loop over
* the indexed columns, even if it would be more efficient.
*/
if (HeapTupleHasExternal(key_tuple))
{
HeapTuple oldtup = key_tuple;
Avoid touching replica identity index in ExtractReplicaIdentity(). In what seems like a fit of misplaced optimization, ExtractReplicaIdentity() accessed the relation's replica-identity index without taking any lock on it. Usually, the surrounding query already holds some lock so this is safe enough ... but in the case of a previously-planned delete, there might be no existing lock. Given a suitable test case, this is exposed in v12 and HEAD by an assertion added by commit b04aeb0a0. The whole thing's rather poorly thought out anyway; rather than looking directly at the index, we should use the index-attributes bitmap that's held by the parent table's relcache entry, as the caller functions do. This is more consistent and likely a bit faster, since it avoids a cache lookup. Hence, change to doing it that way. While at it, rather than blithely assuming that the identity columns are non-null (with catastrophic results if that's wrong), add assertion checks that they aren't null. Possibly those should be actual test-and-elog, but I'll leave it like this for now. In principle, this is a bug that's been there since this code was introduced (in 9.4). In practice, the risk seems quite low, since we do have a lock on the index's parent table, so concurrent changes to the index's catalog entries seem unlikely. Given the precedent that commit 9c703c169 wasn't back-patched, I won't risk back-patching this further than v12. Per report from Hadi Moshayedi. Discussion: https://postgr.es/m/CAK=1=Wrek44Ese1V7LjKiQS-Nd-5LgLi_5_CskGbpggKEf3tKQ@mail.gmail.com
2019-09-02 22:10:37 +02:00
key_tuple = toast_flatten_tuple(oldtup, desc);
heap_freetuple(oldtup);
}
return key_tuple;
}
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
* Handles XLOG_HEAP2_PRUNE record type.
*
Standardize cleanup lock terminology. The term "super-exclusive lock" is a synonym for "buffer cleanup lock" that first appeared in nbtree many years ago. Standardize things by consistently using the term cleanup lock. This finishes work started by commit 276db875. There is no good reason to have two terms. But there is a good reason to only have one: to avoid confusion around why VACUUM acquires a full cleanup lock (not just an ordinary exclusive lock) in index AMs, during ambulkdelete calls. This has nothing to do with protecting the physical index data structure itself. It is needed to implement a locking protocol that ensures that TIDs pointing to the heap/table structure cannot get marked for recycling by VACUUM before it is safe (which is somewhat similar to how VACUUM uses cleanup locks during its first heap pass). Note that it isn't strictly necessary for index AMs to implement this locking protocol -- several index AMs use an MVCC snapshot as their sole interlock to prevent unsafe TID recycling. In passing, update the nbtree README. Cleanly separate discussion of the aforementioned index vacuuming locking protocol from discussion of the "drop leaf page pin" optimization added by commit 2ed5b87f. We now structure discussion of the latter by describing how individual index scans may safely opt out of applying the standard locking protocol (and so can avoid blocking progress by VACUUM). Also document why the optimization is not safe to apply during nbtree index-only scans. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-WzngHgQa92tz6NQihf4nxJwRzCV36yMJO_i8dS+2mgEVKw@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkHPgsBBvGWjz=8PjNhDefy7XRkDKiT5NxMs-n5ZCf2dA@mail.gmail.com
2021-12-09 02:24:45 +01:00
* Acquires a full cleanup lock.
*/
2000-10-29 19:33:41 +01:00
static void
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
heap_xlog_prune(XLogReaderState *record)
2000-10-29 19:33:41 +01:00
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
xl_heap_prune *xlrec = (xl_heap_prune *) XLogRecGetData(record);
Buffer buffer;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
BlockNumber blkno;
XLogRedoAction action;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* We're about to remove tuples. In Hot Standby mode, ensure that there's
* no queries running for which the removed tuples are still visible.
*/
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
if (InHotStandby)
ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
Handle logical slot conflicts on standby During WAL replay on the standby, when a conflict with a logical slot is identified, invalidate such slots. There are two sources of conflicts: 1) Using the information added in 6af1793954e, logical slots are invalidated if required rows are removed 2) wal_level on the primary server is reduced to below logical Uses the infrastructure introduced in the prior commit. FIXME: add commit reference. Change InvalidatePossiblyObsoleteSlot() to use a recovery conflict to interrupt use of a slot, if called in the startup process. The new recovery conflict is added to pg_stat_database_conflicts, as confl_active_logicalslot. See 6af1793954e for an overall design of logical decoding on a standby. Bumps catversion for the addition of the pg_stat_database_conflicts column. Bumps PGSTAT_FILE_FORMAT_ID for the same reason. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Reviewed-by: Alvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20230407075009.igg7be27ha2htkbt@awork3.anarazel.de
2023-04-08 08:11:28 +02:00
xlrec->isCatalogRel,
rlocator);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
/*
* If we have a full-page image, restore it (using a cleanup lock) and
* we're done.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
&buffer);
if (action == BLK_NEEDS_REDO)
{
Page page = (Page) BufferGetPage(buffer);
OffsetNumber *end;
OffsetNumber *redirected;
OffsetNumber *nowdead;
OffsetNumber *nowunused;
int nredirected;
int ndead;
int nunused;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
Size datalen;
redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
nredirected = xlrec->nredirected;
ndead = xlrec->ndead;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
end = (OffsetNumber *) ((char *) redirected + datalen);
nowdead = redirected + (nredirected * 2);
nowunused = nowdead + ndead;
nunused = (end - nowunused);
Assert(nunused >= 0);
/* Update all line pointers per the record, and repair fragmentation */
heap_page_prune_execute(buffer,
redirected, nredirected,
nowdead, ndead,
nowunused, nunused);
/*
* Note: we don't worry about updating the page's prunability hints.
* At worst this will cause an extra prune cycle to occur soon.
*/
2000-10-29 19:33:41 +01:00
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
if (BufferIsValid(buffer))
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
{
Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
UnlockReleaseBuffer(buffer);
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
/*
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
* After pruning records from a page, it's useful to update the FSM
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
* about it, as it may cause the page become target for insertions
* later even if vacuum decides not to visit it (which is possible if
* gets marked all-visible.)
*
* Do this regardless of a full-page image being applied, since the
* FSM data is not in the page anyway.
*/
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
}
2000-10-29 19:33:41 +01:00
}
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
/*
* Handles XLOG_HEAP2_VACUUM record type.
*
Standardize cleanup lock terminology. The term "super-exclusive lock" is a synonym for "buffer cleanup lock" that first appeared in nbtree many years ago. Standardize things by consistently using the term cleanup lock. This finishes work started by commit 276db875. There is no good reason to have two terms. But there is a good reason to only have one: to avoid confusion around why VACUUM acquires a full cleanup lock (not just an ordinary exclusive lock) in index AMs, during ambulkdelete calls. This has nothing to do with protecting the physical index data structure itself. It is needed to implement a locking protocol that ensures that TIDs pointing to the heap/table structure cannot get marked for recycling by VACUUM before it is safe (which is somewhat similar to how VACUUM uses cleanup locks during its first heap pass). Note that it isn't strictly necessary for index AMs to implement this locking protocol -- several index AMs use an MVCC snapshot as their sole interlock to prevent unsafe TID recycling. In passing, update the nbtree README. Cleanly separate discussion of the aforementioned index vacuuming locking protocol from discussion of the "drop leaf page pin" optimization added by commit 2ed5b87f. We now structure discussion of the latter by describing how individual index scans may safely opt out of applying the standard locking protocol (and so can avoid blocking progress by VACUUM). Also document why the optimization is not safe to apply during nbtree index-only scans. Author: Peter Geoghegan <pg@bowt.ie> Discussion: https://postgr.es/m/CAH2-WzngHgQa92tz6NQihf4nxJwRzCV36yMJO_i8dS+2mgEVKw@mail.gmail.com Discussion: https://postgr.es/m/CAH2-WzkHPgsBBvGWjz=8PjNhDefy7XRkDKiT5NxMs-n5ZCf2dA@mail.gmail.com
2021-12-09 02:24:45 +01:00
* Acquires an ordinary exclusive lock only.
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
*/
static void
heap_xlog_vacuum(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_vacuum *xlrec = (xl_heap_vacuum *) XLogRecGetData(record);
Buffer buffer;
BlockNumber blkno;
XLogRedoAction action;
/*
* If we have a full-page image, restore it (without using a cleanup lock)
* and we're done.
*/
action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false,
&buffer);
if (action == BLK_NEEDS_REDO)
{
Page page = (Page) BufferGetPage(buffer);
OffsetNumber *nowunused;
Size datalen;
OffsetNumber *offnum;
nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
/* Shouldn't be a record unless there's something to do */
Assert(xlrec->nunused > 0);
/* Update all now-unused line pointers */
offnum = nowunused;
for (int i = 0; i < xlrec->nunused; i++)
{
OffsetNumber off = *offnum++;
ItemId lp = PageGetItemId(page, off);
Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
ItemIdSetUnused(lp);
}
/* Attempt to truncate line pointer array now */
PageTruncateLinePointerArray(page);
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
{
Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
UnlockReleaseBuffer(buffer);
/*
* After vacuuming LP_DEAD items from a page, it's useful to update
* the FSM about it, as it may cause the page become target for
* insertions later even if vacuum decides not to visit it (which is
* possible if gets marked all-visible.)
*
* Do this regardless of a full-page image being applied, since the
* FSM data is not in the page anyway.
*/
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
}
}
/*
* Replay XLOG_HEAP2_VISIBLE record.
*
* The critical integrity requirement here is that we must never end up with
* a situation where the visibility map bit is set, and the page-level
* PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
* page modification would fail to clear the visibility map bit.
*/
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_visible(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
Buffer vmbuffer = InvalidBuffer;
Buffer buffer;
Page page;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
BlockNumber blkno;
XLogRedoAction action;
Add info in WAL records in preparation for logical slot conflict handling This commit only implements one prerequisite part for allowing logical decoding. The commit message contains an explanation of the overall design, which later commits will refer back to. Overall design: 1. We want to enable logical decoding on standbys, but replay of WAL from the primary might remove data that is needed by logical decoding, causing error(s) on the standby. To prevent those errors, a new replication conflict scenario needs to be addressed (as much as hot standby does). 2. Our chosen strategy for dealing with this type of replication slot is to invalidate logical slots for which needed data has been removed. 3. To do this we need the latestRemovedXid for each change, just as we do for physical replication conflicts, but we also need to know whether any particular change was to data that logical replication might access. That way, during WAL replay, we know when there is a risk of conflict and, if so, if there is a conflict. 4. We can't rely on the standby's relcache entries for this purpose in any way, because the startup process can't access catalog contents. 5. Therefore every WAL record that potentially removes data from the index or heap must carry a flag indicating whether or not it is one that might be accessed during logical decoding. Why do we need this for logical decoding on standby? First, let's forget about logical decoding on standby and recall that on a primary database, any catalog rows that may be needed by a logical decoding replication slot are not removed. This is done thanks to the catalog_xmin associated with the logical replication slot. But, with logical decoding on standby, in the following cases: - hot_standby_feedback is off - hot_standby_feedback is on but there is no a physical slot between the primary and the standby. Then, hot_standby_feedback will work, but only while the connection is alive (for example a node restart would break it) Then, the primary may delete system catalog rows that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). So, it’s mandatory to identify those rows and invalidate the slots that may need them if any. Identifying those rows is the purpose of this commit. Implementation: When a WAL replay on standby indicates that a catalog table tuple is to be deleted by an xid that is greater than a logical slot's catalog_xmin, then that means the slot's catalog_xmin conflicts with the xid, and we need to handle the conflict. While subsequent commits will do the actual conflict handling, this commit adds a new field isCatalogRel in such WAL records (and a new bit set in the xl_heap_visible flags field), that is true for catalog tables, so as to arrange for conflict handling. The affected WAL records are the ones that already contain the snapshotConflictHorizon field, namely: - gistxlogDelete - gistxlogPageReuse - xl_hash_vacuum_one_page - xl_heap_prune - xl_heap_freeze_page - xl_heap_visible - xl_btree_reuse_page - xl_btree_delete - spgxlogVacuumRedirect Due to this new field being added, xl_hash_vacuum_one_page and gistxlogDelete do now contain the offsets to be deleted as a FLEXIBLE_ARRAY_MEMBER. This is needed to ensure correct alignment. It's not needed on the others struct where isCatalogRel has been added. This commit just introduces the WAL format changes mentioned above. Handling the actual conflicts will follow in future commits. Bumps XLOG_PAGE_MAGIC as the several WAL records are changed. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> (in an older version) Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
2023-04-02 21:32:19 +02:00
Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags);
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno);
/*
* If there are any Hot Standby transactions running that have an xmin
* horizon old enough that this page isn't all-visible for them, they
* might incorrectly decide that an index-only scan can skip a heap fetch.
*
* NB: It might be better to throw some kind of "soft" conflict here that
* forces any index-only scan that is in flight to perform heap fetches,
* rather than killing the transaction outright.
*/
if (InHotStandby)
ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
Handle logical slot conflicts on standby During WAL replay on the standby, when a conflict with a logical slot is identified, invalidate such slots. There are two sources of conflicts: 1) Using the information added in 6af1793954e, logical slots are invalidated if required rows are removed 2) wal_level on the primary server is reduced to below logical Uses the infrastructure introduced in the prior commit. FIXME: add commit reference. Change InvalidatePossiblyObsoleteSlot() to use a recovery conflict to interrupt use of a slot, if called in the startup process. The new recovery conflict is added to pg_stat_database_conflicts, as confl_active_logicalslot. See 6af1793954e for an overall design of logical decoding on a standby. Bumps catversion for the addition of the pg_stat_database_conflicts column. Bumps PGSTAT_FILE_FORMAT_ID for the same reason. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Reviewed-by: Alvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20230407075009.igg7be27ha2htkbt@awork3.anarazel.de
2023-04-08 08:11:28 +02:00
xlrec->flags & VISIBILITYMAP_XLOG_CATALOG_REL,
rlocator);
/*
* Read the heap page, if it still exists. If the heap file has dropped or
* truncated later in recovery, we don't need to update the page, but we'd
* better still update the visibility map.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
action = XLogReadBufferForRedo(record, 1, &buffer);
if (action == BLK_NEEDS_REDO)
{
/*
* We don't bump the LSN of the heap page when setting the visibility
* map bit (unless checksums or wal_hint_bits is enabled, in which
* case we must). This exposes us to torn page hazards, but since
* we're not inspecting the existing page contents in any way, we
* don't care.
*/
page = BufferGetPage(buffer);
PageSetAllVisible(page);
if (XLogHintBitIsNeeded())
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
else if (action == BLK_RESTORED)
{
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
/*
* If heap block was backed up, we already restored it and there's
* nothing more to do. (This can only happen with checksums or
* wal_log_hints enabled.)
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
*/
}
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
if (BufferIsValid(buffer))
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
{
Size space = PageGetFreeSpace(BufferGetPage(buffer));
UnlockReleaseBuffer(buffer);
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
/*
* Since FSM is not WAL-logged and only updated heuristically, it
* easily becomes stale in standbys. If the standby is later promoted
* and runs VACUUM, it will skip updating individual free space
* figures for pages that became all-visible (or all-frozen, depending
* on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum
* propagates too optimistic free space values to upper FSM layers;
* later inserters try to use such pages only to find out that they
* are unusable. This can cause long stalls when there are many such
* pages.
*
* Forestall those problems by updating FSM's idea about a page that
* is becoming all-visible or all-frozen.
*
* Do this regardless of a full-page image being applied, since the
* FSM data is not in the page anyway.
*/
if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecordPageWithFreeSpace(rlocator, blkno, space);
Update FSM on WAL replay of page all-visible/frozen We aren't very strict about keeping FSM up to date on WAL replay, because per-page freespace values aren't critical in replicas (can't write to heap in a replica; and if the replica is promoted, the values would be updated by VACUUM anyway). However, VACUUM since 9.6 can skip processing pages marked all-visible or all-frozen, and if such pages are recorded in FSM with wrong values, those values are blindly propagated to FSM's upper layers by VACUUM's FreeSpaceMapVacuum. (This rationale assumes that crashes are not very frequent, because those would cause outdated FSM to occur in the primary.) Even when the FSM is outdated in standby, things are not too bad normally, because, most per-page FSM values will be zero (other than those propagated with the base-backup that created the standby); only once the remaining free space is less than 0.2*BLCKSZ the per-page value is maintained by WAL replay of heap ins/upd/del. However, if wal_log_hints=on causes complete FSM pages to be propagated to a standby via full-page images, many too-optimistic per-page values can end up being registered in the standby. Incorrect per-page values aren't critical in most cases, since an inserter that is given a page that doesn't actually contain the claimed free space will update FSM with the correct value, and retry until it finds a usable page. However, if there are many such updates to do, an inserter can spend a long time doing them before a usable page is found; in a heavily trafficked insert-only table with many concurrent inserters this has been observed to cause several second stalls, causing visible application malfunction. To fix this problem, it seems sufficient to have heap_xlog_visible (replay of setting all-visible and all-frozen VM bits for a heap page) update the FSM value for the page being processed. This fixes the per-page counters together with making the page skippable to vacuum, so when vacuum does FreeSpaceMapVacuum, the values propagated to FSM upper layers are the correct ones, avoiding the problem. While at it, apply the same fix to heap_xlog_clean (replay of tuple removal by HOT pruning and vacuum). This makes any space freed by the cleaning available earlier than the next vacuum in the promoted replica. Backpatch to 9.6, where this problem was diagnosed on an insert-only table with all-frozen pages, which were introduced as a concept in that release. Theoretically it could apply with all-visible pages to older branches, but there's been no report of that and it doesn't backpatch cleanly anyway. Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20180802172857.5skoexsilnjvgruk@alvherre.pgsql
2018-08-15 23:09:29 +02:00
}
/*
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
* Even if we skipped the heap page update due to the LSN interlock, it's
* still safe to update the visibility map. Any WAL record that clears
* the visibility map bit does so before checking the page LSN, so any
* bits that need to be cleared will still be cleared.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
&vmbuffer) == BLK_NEEDS_REDO)
{
Page vmpage = BufferGetPage(vmbuffer);
Relation reln;
Add info in WAL records in preparation for logical slot conflict handling This commit only implements one prerequisite part for allowing logical decoding. The commit message contains an explanation of the overall design, which later commits will refer back to. Overall design: 1. We want to enable logical decoding on standbys, but replay of WAL from the primary might remove data that is needed by logical decoding, causing error(s) on the standby. To prevent those errors, a new replication conflict scenario needs to be addressed (as much as hot standby does). 2. Our chosen strategy for dealing with this type of replication slot is to invalidate logical slots for which needed data has been removed. 3. To do this we need the latestRemovedXid for each change, just as we do for physical replication conflicts, but we also need to know whether any particular change was to data that logical replication might access. That way, during WAL replay, we know when there is a risk of conflict and, if so, if there is a conflict. 4. We can't rely on the standby's relcache entries for this purpose in any way, because the startup process can't access catalog contents. 5. Therefore every WAL record that potentially removes data from the index or heap must carry a flag indicating whether or not it is one that might be accessed during logical decoding. Why do we need this for logical decoding on standby? First, let's forget about logical decoding on standby and recall that on a primary database, any catalog rows that may be needed by a logical decoding replication slot are not removed. This is done thanks to the catalog_xmin associated with the logical replication slot. But, with logical decoding on standby, in the following cases: - hot_standby_feedback is off - hot_standby_feedback is on but there is no a physical slot between the primary and the standby. Then, hot_standby_feedback will work, but only while the connection is alive (for example a node restart would break it) Then, the primary may delete system catalog rows that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). So, it’s mandatory to identify those rows and invalidate the slots that may need them if any. Identifying those rows is the purpose of this commit. Implementation: When a WAL replay on standby indicates that a catalog table tuple is to be deleted by an xid that is greater than a logical slot's catalog_xmin, then that means the slot's catalog_xmin conflicts with the xid, and we need to handle the conflict. While subsequent commits will do the actual conflict handling, this commit adds a new field isCatalogRel in such WAL records (and a new bit set in the xl_heap_visible flags field), that is true for catalog tables, so as to arrange for conflict handling. The affected WAL records are the ones that already contain the snapshotConflictHorizon field, namely: - gistxlogDelete - gistxlogPageReuse - xl_hash_vacuum_one_page - xl_heap_prune - xl_heap_freeze_page - xl_heap_visible - xl_btree_reuse_page - xl_btree_delete - spgxlogVacuumRedirect Due to this new field being added, xl_hash_vacuum_one_page and gistxlogDelete do now contain the offsets to be deleted as a FLEXIBLE_ARRAY_MEMBER. This is needed to ensure correct alignment. It's not needed on the others struct where isCatalogRel has been added. This commit just introduces the WAL format changes mentioned above. Handling the actual conflicts will follow in future commits. Bumps XLOG_PAGE_MAGIC as the several WAL records are changed. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> (in an older version) Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
2023-04-02 21:32:19 +02:00
uint8 vmbits;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/* initialize the page if it was read as zeros */
if (PageIsNew(vmpage))
PageInit(vmpage, BLCKSZ, 0);
Add info in WAL records in preparation for logical slot conflict handling This commit only implements one prerequisite part for allowing logical decoding. The commit message contains an explanation of the overall design, which later commits will refer back to. Overall design: 1. We want to enable logical decoding on standbys, but replay of WAL from the primary might remove data that is needed by logical decoding, causing error(s) on the standby. To prevent those errors, a new replication conflict scenario needs to be addressed (as much as hot standby does). 2. Our chosen strategy for dealing with this type of replication slot is to invalidate logical slots for which needed data has been removed. 3. To do this we need the latestRemovedXid for each change, just as we do for physical replication conflicts, but we also need to know whether any particular change was to data that logical replication might access. That way, during WAL replay, we know when there is a risk of conflict and, if so, if there is a conflict. 4. We can't rely on the standby's relcache entries for this purpose in any way, because the startup process can't access catalog contents. 5. Therefore every WAL record that potentially removes data from the index or heap must carry a flag indicating whether or not it is one that might be accessed during logical decoding. Why do we need this for logical decoding on standby? First, let's forget about logical decoding on standby and recall that on a primary database, any catalog rows that may be needed by a logical decoding replication slot are not removed. This is done thanks to the catalog_xmin associated with the logical replication slot. But, with logical decoding on standby, in the following cases: - hot_standby_feedback is off - hot_standby_feedback is on but there is no a physical slot between the primary and the standby. Then, hot_standby_feedback will work, but only while the connection is alive (for example a node restart would break it) Then, the primary may delete system catalog rows that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). So, it’s mandatory to identify those rows and invalidate the slots that may need them if any. Identifying those rows is the purpose of this commit. Implementation: When a WAL replay on standby indicates that a catalog table tuple is to be deleted by an xid that is greater than a logical slot's catalog_xmin, then that means the slot's catalog_xmin conflicts with the xid, and we need to handle the conflict. While subsequent commits will do the actual conflict handling, this commit adds a new field isCatalogRel in such WAL records (and a new bit set in the xl_heap_visible flags field), that is true for catalog tables, so as to arrange for conflict handling. The affected WAL records are the ones that already contain the snapshotConflictHorizon field, namely: - gistxlogDelete - gistxlogPageReuse - xl_hash_vacuum_one_page - xl_heap_prune - xl_heap_freeze_page - xl_heap_visible - xl_btree_reuse_page - xl_btree_delete - spgxlogVacuumRedirect Due to this new field being added, xl_hash_vacuum_one_page and gistxlogDelete do now contain the offsets to be deleted as a FLEXIBLE_ARRAY_MEMBER. This is needed to ensure correct alignment. It's not needed on the others struct where isCatalogRel has been added. This commit just introduces the WAL format changes mentioned above. Handling the actual conflicts will follow in future commits. Bumps XLOG_PAGE_MAGIC as the several WAL records are changed. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> (in an older version) Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
2023-04-02 21:32:19 +02:00
/* remove VISIBILITYMAP_XLOG_* */
vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/*
* XLogReadBufferForRedoExtended locked the buffer. But
* visibilitymap_set will handle locking itself.
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
*/
LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
reln = CreateFakeRelcacheEntry(rlocator);
visibilitymap_pin(reln, blkno, &vmbuffer);
visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
Add info in WAL records in preparation for logical slot conflict handling This commit only implements one prerequisite part for allowing logical decoding. The commit message contains an explanation of the overall design, which later commits will refer back to. Overall design: 1. We want to enable logical decoding on standbys, but replay of WAL from the primary might remove data that is needed by logical decoding, causing error(s) on the standby. To prevent those errors, a new replication conflict scenario needs to be addressed (as much as hot standby does). 2. Our chosen strategy for dealing with this type of replication slot is to invalidate logical slots for which needed data has been removed. 3. To do this we need the latestRemovedXid for each change, just as we do for physical replication conflicts, but we also need to know whether any particular change was to data that logical replication might access. That way, during WAL replay, we know when there is a risk of conflict and, if so, if there is a conflict. 4. We can't rely on the standby's relcache entries for this purpose in any way, because the startup process can't access catalog contents. 5. Therefore every WAL record that potentially removes data from the index or heap must carry a flag indicating whether or not it is one that might be accessed during logical decoding. Why do we need this for logical decoding on standby? First, let's forget about logical decoding on standby and recall that on a primary database, any catalog rows that may be needed by a logical decoding replication slot are not removed. This is done thanks to the catalog_xmin associated with the logical replication slot. But, with logical decoding on standby, in the following cases: - hot_standby_feedback is off - hot_standby_feedback is on but there is no a physical slot between the primary and the standby. Then, hot_standby_feedback will work, but only while the connection is alive (for example a node restart would break it) Then, the primary may delete system catalog rows that could be needed by the logical decoding on the standby (as it does not know about the catalog_xmin on the standby). So, it’s mandatory to identify those rows and invalidate the slots that may need them if any. Identifying those rows is the purpose of this commit. Implementation: When a WAL replay on standby indicates that a catalog table tuple is to be deleted by an xid that is greater than a logical slot's catalog_xmin, then that means the slot's catalog_xmin conflicts with the xid, and we need to handle the conflict. While subsequent commits will do the actual conflict handling, this commit adds a new field isCatalogRel in such WAL records (and a new bit set in the xl_heap_visible flags field), that is true for catalog tables, so as to arrange for conflict handling. The affected WAL records are the ones that already contain the snapshotConflictHorizon field, namely: - gistxlogDelete - gistxlogPageReuse - xl_hash_vacuum_one_page - xl_heap_prune - xl_heap_freeze_page - xl_heap_visible - xl_btree_reuse_page - xl_btree_delete - spgxlogVacuumRedirect Due to this new field being added, xl_hash_vacuum_one_page and gistxlogDelete do now contain the offsets to be deleted as a FLEXIBLE_ARRAY_MEMBER. This is needed to ensure correct alignment. It's not needed on the others struct where isCatalogRel has been added. This commit just introduces the WAL format changes mentioned above. Handling the actual conflicts will follow in future commits. Bumps XLOG_PAGE_MAGIC as the several WAL records are changed. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> (in an older version) Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
2023-04-02 21:32:19 +02:00
xlrec->snapshotConflictHorizon, vmbits);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
else if (BufferIsValid(vmbuffer))
UnlockReleaseBuffer(vmbuffer);
}
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
/*
* Replay XLOG_HEAP2_FREEZE_PAGE records
*/
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_freeze_page(XLogReaderState *record)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
Buffer buffer;
/*
* In Hot Standby mode, ensure that there's no queries running which still
* consider the frozen xids as running.
*/
if (InHotStandby)
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL);
ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
Handle logical slot conflicts on standby During WAL replay on the standby, when a conflict with a logical slot is identified, invalidate such slots. There are two sources of conflicts: 1) Using the information added in 6af1793954e, logical slots are invalidated if required rows are removed 2) wal_level on the primary server is reduced to below logical Uses the infrastructure introduced in the prior commit. FIXME: add commit reference. Change InvalidatePossiblyObsoleteSlot() to use a recovery conflict to interrupt use of a slot, if called in the startup process. The new recovery conflict is added to pg_stat_database_conflicts, as confl_active_logicalslot. See 6af1793954e for an overall design of logical decoding on a standby. Bumps catversion for the addition of the pg_stat_database_conflicts column. Bumps PGSTAT_FILE_FORMAT_ID for the same reason. Author: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> Author: Amit Khandekar <amitdkhan.pg@gmail.com> (in an older version) Reviewed-by: "Drouvot, Bertrand" <bertranddrouvot.pg@gmail.com> Reviewed-by: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Fabrízio de Royes Mello <fabriziomello@gmail.com> Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Reviewed-by: Amit Kapila <amit.kapila16@gmail.com> Reviewed-by: Alvaro Herrera <alvherre@alvh.no-ip.org> Discussion: https://postgr.es/m/20230407075009.igg7be27ha2htkbt@awork3.anarazel.de
2023-04-08 08:11:28 +02:00
xlrec->isCatalogRel,
rlocator);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
}
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
{
Page page = BufferGetPage(buffer);
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
xl_heap_freeze_plan *plans;
OffsetNumber *offsets;
int curoff = 0;
plans = (xl_heap_freeze_plan *) XLogRecGetBlockData(record, 0, NULL);
offsets = (OffsetNumber *) ((char *) plans +
(xlrec->nplans *
sizeof(xl_heap_freeze_plan)));
for (int p = 0; p < xlrec->nplans; p++)
{
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
HeapTupleFreeze frz;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
/*
* Convert freeze plan representation from WAL record into
* per-tuple format used by heap_execute_freeze_tuple
*/
frz.xmax = plans[p].xmax;
frz.t_infomask2 = plans[p].t_infomask2;
frz.t_infomask = plans[p].t_infomask;
frz.frzflags = plans[p].frzflags;
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
frz.offset = InvalidOffsetNumber; /* unused, but be tidy */
for (int i = 0; i < plans[p].ntuples; i++)
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
{
OffsetNumber offset = offsets[curoff++];
ItemId lp;
HeapTupleHeader tuple;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
Deduplicate freeze plans in freeze WAL records. Make heapam WAL records that describe freezing performed by VACUUM more space efficient by storing each distinct "freeze plan" once, alongside an array of associated page offset numbers (one per freeze plan). The freeze plans required for most heap pages tend to naturally have a great deal of redundancy, so this technique is very effective in practice. It often leads to freeze WAL records that are less than 20% of the size of equivalent WAL records generated using the previous approach. The freeze plan concept was introduced by commit 3b97e6823b, which fixed bugs in VACUUM's handling of MultiXacts. We retain the concept of freeze plans, but go back to using page offset number arrays. There is no loss of generality here because deduplication is an additive process that gets applied mechanically when FREEZE_PAGE WAL records are built. More than anything else, freeze plan deduplication is an optimization that reduces the marginal cost of freezing additional tuples on pages that will need to have at least one or two tuples frozen in any case. Ongoing work that adds page-level freezing to VACUUM will take full advantage of the improved cost profile through batching. Also refactor some of the details surrounding recovery conflicts needed to REDO freeze records in passing: make original execution responsible for generating a standard latestRemovedXid cutoff, rather than working backwards to get the same cutoff in the REDO routine. Bugfix commit 66fbcb0d2e did it the other way around, which is equivalent but obscures what's going on. Also rename the cutoff field from the WAL record/struct (rename the field cutoff_xid to latestRemovedXid to match similar WAL records). Processing of conflicts by REDO routines is already completely uniform, so tools like pg_waldump should present the information driving the process uniformly. There are two remaining WAL record types that still don't quite follow this convention (heapam's VISIBLE record type and SP-GiST's VACUUM_REDIRECT record type). They can be brought into line by later work that totally standardizes how the cutoffs are presented. Bump XLOG_PAGE_MAGIC. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Reviewed-By: Nathan Bossart <nathandbossart@gmail.com> Reviewed-By: Justin Pryzby <pryzby@telsasoft.com> Discussion: https://postgr.es/m/CAH2-Wz=XytErMnb8FAyFd+OQEbiipB0Q2FmFdXrggPL4VBnRYQ@mail.gmail.com
2022-11-15 16:48:41 +01:00
lp = PageGetItemId(page, offset);
tuple = (HeapTupleHeader) PageGetItem(page, lp);
heap_execute_freeze_tuple(tuple, &frz);
}
}
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
/*
* Given an "infobits" field from an XLog record, set the correct bits in the
* given infomask and infomask2 for the tuple touched by the record.
*
* (This is the reverse of compute_infobits).
*/
static void
fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
{
*infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
*infomask2 &= ~HEAP_KEYS_UPDATED;
if (infobits & XLHL_XMAX_IS_MULTI)
*infomask |= HEAP_XMAX_IS_MULTI;
if (infobits & XLHL_XMAX_LOCK_ONLY)
*infomask |= HEAP_XMAX_LOCK_ONLY;
if (infobits & XLHL_XMAX_EXCL_LOCK)
*infomask |= HEAP_XMAX_EXCL_LOCK;
/* note HEAP_XMAX_SHR_LOCK isn't considered here */
if (infobits & XLHL_XMAX_KEYSHR_LOCK)
*infomask |= HEAP_XMAX_KEYSHR_LOCK;
if (infobits & XLHL_KEYS_UPDATED)
*infomask2 |= HEAP_KEYS_UPDATED;
}
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_delete(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
Buffer buffer;
Page page;
2000-11-20 22:14:13 +01:00
ItemId lp = NULL;
HeapTupleHeader htup;
BlockNumber blkno;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator target_locator;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
ItemPointerData target_tid;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
ItemPointerSetBlockNumber(&target_tid, blkno);
ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
Relation reln = CreateFakeRelcacheEntry(target_locator);
Buffer vmbuffer = InvalidBuffer;
visibilitymap_pin(reln, blkno, &vmbuffer);
visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
{
page = BufferGetPage(buffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
lp = PageGetItemId(page, xlrec->offnum);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
elog(PANIC, "invalid lp");
htup = (HeapTupleHeader) PageGetItem(page, lp);
2000-10-29 19:33:41 +01:00
htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
HeapTupleHeaderClearHotUpdated(htup);
fix_infomask_from_infobits(xlrec->infobits_set,
&htup->t_infomask, &htup->t_infomask2);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
HeapTupleHeaderSetXmax(htup, xlrec->xmax);
else
HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
/* Mark the page as a candidate for pruning */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
PageSetPrunable(page, XLogRecGetXid(record));
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
2018-04-07 22:24:10 +02:00
/* Make sure t_ctid is set correctly */
if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
HeapTupleHeaderSetMovedPartitions(htup);
else
htup->t_ctid = target_tid;
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
}
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_insert(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
Buffer buffer;
Page page;
union
{
HeapTupleHeaderData hdr;
char data[MaxHeapTupleSize];
} tbuf;
HeapTupleHeader htup;
xl_heap_header xlhdr;
uint32 newlen;
Size freespace = 0;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator target_locator;
BlockNumber blkno;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
ItemPointerData target_tid;
XLogRedoAction action;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
ItemPointerSetBlockNumber(&target_tid, blkno);
ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
Relation reln = CreateFakeRelcacheEntry(target_locator);
Buffer vmbuffer = InvalidBuffer;
visibilitymap_pin(reln, blkno, &vmbuffer);
visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
/*
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
* If we inserted the first and only tuple on the page, re-initialize the
* page from scratch.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
buffer = XLogInitBufferForRedo(record, 0);
page = BufferGetPage(buffer);
PageInit(page, BufferGetPageSize(buffer), 0);
action = BLK_NEEDS_REDO;
}
else
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
action = XLogReadBufferForRedo(record, 0, &buffer);
if (action == BLK_NEEDS_REDO)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
Size datalen;
char *data;
page = BufferGetPage(buffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
elog(PANIC, "invalid max offset number");
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
data = XLogRecGetBlockData(record, 0, &datalen);
newlen = datalen - SizeOfHeapHeader;
Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
data += SizeOfHeapHeader;
htup = &tbuf.hdr;
MemSet((char *) htup, 0, SizeofHeapTupleHeader);
/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
memcpy((char *) htup + SizeofHeapTupleHeader,
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
data,
newlen);
newlen += SizeofHeapTupleHeader;
htup->t_infomask2 = xlhdr.t_infomask2;
htup->t_infomask = xlhdr.t_infomask;
htup->t_hoff = xlhdr.t_hoff;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
HeapTupleHeaderSetCmin(htup, FirstCommandId);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
htup->t_ctid = target_tid;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
true, true) == InvalidOffsetNumber)
elog(PANIC, "failed to add tuple");
freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
PageSetLSN(page, lsn);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
PageSetAllVisible(page);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
/*
* If the page is running low on free space, update the FSM as well.
* Arbitrarily, our definition of "low" is less than 20%. We can't do much
* better than that without knowing the fill-factor for the table.
*
* XXX: Don't do this if the page was restored from full page image. We
* don't bother to update the FSM in that case, it doesn't need to be
* totally accurate anyway.
*/
if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecordPageWithFreeSpace(target_locator, blkno, freespace);
}
/*
* Handles MULTI_INSERT record type.
*/
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_multi_insert(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_multi_insert *xlrec;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
BlockNumber blkno;
Buffer buffer;
Page page;
union
{
HeapTupleHeaderData hdr;
char data[MaxHeapTupleSize];
} tbuf;
HeapTupleHeader htup;
uint32 newlen;
Size freespace = 0;
int i;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
bool isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
XLogRedoAction action;
/*
* Insertion doesn't overwrite MVCC data, so no conflict processing is
* required.
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
/* check that the mutually exclusive flags are not both set */
Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) &&
(xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)));
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
Relation reln = CreateFakeRelcacheEntry(rlocator);
Buffer vmbuffer = InvalidBuffer;
visibilitymap_pin(reln, blkno, &vmbuffer);
visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
if (isinit)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
buffer = XLogInitBufferForRedo(record, 0);
page = BufferGetPage(buffer);
PageInit(page, BufferGetPageSize(buffer), 0);
action = BLK_NEEDS_REDO;
}
else
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
action = XLogReadBufferForRedo(record, 0, &buffer);
if (action == BLK_NEEDS_REDO)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
char *tupdata;
char *endptr;
Size len;
/* Tuples are stored as block data */
tupdata = XLogRecGetBlockData(record, 0, &len);
endptr = tupdata + len;
page = (Page) BufferGetPage(buffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
for (i = 0; i < xlrec->ntuples; i++)
{
OffsetNumber offnum;
xl_multi_insert_tuple *xlhdr;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
/*
* If we're reinitializing the page, the tuples are stored in
* order from FirstOffsetNumber. Otherwise there's an array of
* offsets in the WAL record, and the tuples come after that.
*/
if (isinit)
offnum = FirstOffsetNumber + i;
else
offnum = xlrec->offsets[i];
if (PageGetMaxOffsetNumber(page) + 1 < offnum)
elog(PANIC, "invalid max offset number");
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
newlen = xlhdr->datalen;
Assert(newlen <= MaxHeapTupleSize);
htup = &tbuf.hdr;
MemSet((char *) htup, 0, SizeofHeapTupleHeader);
/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
memcpy((char *) htup + SizeofHeapTupleHeader,
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
(char *) tupdata,
newlen);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
tupdata += newlen;
newlen += SizeofHeapTupleHeader;
htup->t_infomask2 = xlhdr->t_infomask2;
htup->t_infomask = xlhdr->t_infomask;
htup->t_hoff = xlhdr->t_hoff;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
HeapTupleHeaderSetCmin(htup, FirstCommandId);
ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
if (offnum == InvalidOffsetNumber)
elog(PANIC, "failed to add tuple");
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (tupdata != endptr)
elog(PANIC, "total tuple length mismatch");
freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
PageSetLSN(page, lsn);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
PageSetAllVisible(page);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
/*
* If the page is running low on free space, update the FSM as well.
* Arbitrarily, our definition of "low" is less than 20%. We can't do much
* better than that without knowing the fill-factor for the table.
*
* XXX: Don't do this if the page was restored from full page image. We
* don't bother to update the FSM in that case, it doesn't need to be
* totally accurate anyway.
*/
if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
}
2000-10-24 11:56:23 +02:00
/*
* Handles UPDATE and HOT_UPDATE
2000-10-24 11:56:23 +02:00
*/
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_update(XLogReaderState *record, bool hot_update)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
BlockNumber oldblk;
BlockNumber newblk;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
ItemPointerData newtid;
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
Buffer obuffer,
nbuffer;
Page page;
OffsetNumber offnum;
2000-11-20 22:14:13 +01:00
ItemId lp = NULL;
HeapTupleData oldtup;
HeapTupleHeader htup;
uint16 prefixlen = 0,
suffixlen = 0;
char *newp;
union
{
HeapTupleHeaderData hdr;
char data[MaxHeapTupleSize];
} tbuf;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xl_heap_header xlhdr;
uint32 newlen;
Size freespace = 0;
XLogRedoAction oldaction;
XLogRedoAction newaction;
/* initialize to keep the compiler quiet */
oldtup.t_data = NULL;
oldtup.t_len = 0;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk);
if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL))
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
{
/* HOT updates are never done across pages */
Assert(!hot_update);
}
else
oldblk = newblk;
ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
Relation reln = CreateFakeRelcacheEntry(rlocator);
Buffer vmbuffer = InvalidBuffer;
visibilitymap_pin(reln, oldblk, &vmbuffer);
visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
/*
* In normal operation, it is important to lock the two pages in
* page-number order, to avoid possible deadlocks against other update
* operations going the other way. However, during WAL replay there can
* be no other update happening, so we don't need to worry about that. But
* we *do* need to worry that we don't expose an inconsistent state to Hot
* Standby queries --- so the original page can't be unlocked before we've
* added the new tuple to the new page.
*/
/* Deal with old tuple version */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
&obuffer);
if (oldaction == BLK_NEEDS_REDO)
{
page = BufferGetPage(obuffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
offnum = xlrec->old_offnum;
if (PageGetMaxOffsetNumber(page) >= offnum)
lp = PageGetItemId(page, offnum);
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(PANIC, "invalid lp");
htup = (HeapTupleHeader) PageGetItem(page, lp);
oldtup.t_data = htup;
oldtup.t_len = ItemIdGetLength(lp);
2006-10-04 02:30:14 +02:00
htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
if (hot_update)
HeapTupleHeaderSetHotUpdated(htup);
else
HeapTupleHeaderClearHotUpdated(htup);
fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
&htup->t_infomask2);
HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
/* Set forward chain link in t_ctid */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
htup->t_ctid = newtid;
/* Mark the page as a candidate for pruning */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
PageSetPrunable(page, XLogRecGetXid(record));
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
PageSetLSN(page, lsn);
MarkBufferDirty(obuffer);
}
/*
* Read the page the new tuple goes into, if different from old.
*/
if (oldblk == newblk)
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
{
nbuffer = obuffer;
newaction = oldaction;
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
nbuffer = XLogInitBufferForRedo(record, 0);
page = (Page) BufferGetPage(nbuffer);
PageInit(page, BufferGetPageSize(nbuffer), 0);
newaction = BLK_NEEDS_REDO;
}
else
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
Relation reln = CreateFakeRelcacheEntry(rlocator);
Buffer vmbuffer = InvalidBuffer;
visibilitymap_pin(reln, newblk, &vmbuffer);
visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
/* Deal with new tuple */
if (newaction == BLK_NEEDS_REDO)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
char *recdata;
char *recdata_end;
Size datalen;
Size tuplen;
recdata = XLogRecGetBlockData(record, 0, &datalen);
recdata_end = recdata + datalen;
2001-03-22 05:01:46 +01:00
page = BufferGetPage(nbuffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
offnum = xlrec->new_offnum;
if (PageGetMaxOffsetNumber(page) + 1 < offnum)
elog(PANIC, "invalid max offset number");
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
{
Assert(newblk == oldblk);
memcpy(&prefixlen, recdata, sizeof(uint16));
recdata += sizeof(uint16);
}
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
{
Assert(newblk == oldblk);
memcpy(&suffixlen, recdata, sizeof(uint16));
recdata += sizeof(uint16);
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
recdata += SizeOfHeapHeader;
tuplen = recdata_end - recdata;
Assert(tuplen <= MaxHeapTupleSize);
htup = &tbuf.hdr;
MemSet((char *) htup, 0, SizeofHeapTupleHeader);
/*
* Reconstruct the new tuple using the prefix and/or suffix from the
* old tuple, and the data stored in the WAL record.
*/
newp = (char *) htup + SizeofHeapTupleHeader;
if (prefixlen > 0)
{
int len;
/* copy bitmap [+ padding] [+ oid] from WAL record */
len = xlhdr.t_hoff - SizeofHeapTupleHeader;
memcpy(newp, recdata, len);
recdata += len;
newp += len;
/* copy prefix from old tuple */
memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
newp += prefixlen;
/* copy new tuple data from WAL record */
len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
memcpy(newp, recdata, len);
recdata += len;
newp += len;
}
else
{
/*
* copy bitmap [+ padding] [+ oid] + data from record, all in one
* go
*/
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
memcpy(newp, recdata, tuplen);
recdata += tuplen;
newp += tuplen;
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
Assert(recdata == recdata_end);
/* copy suffix from old tuple */
if (suffixlen > 0)
memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
htup->t_infomask2 = xlhdr.t_infomask2;
htup->t_infomask = xlhdr.t_infomask;
htup->t_hoff = xlhdr.t_hoff;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
HeapTupleHeaderSetCmin(htup, FirstCommandId);
HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
/* Make sure there is no forward chain link in t_ctid */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
htup->t_ctid = newtid;
offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
if (offnum == InvalidOffsetNumber)
elog(PANIC, "failed to add tuple");
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
PageClearAllVisible(page);
freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
PageSetLSN(page, lsn);
MarkBufferDirty(nbuffer);
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (BufferIsValid(nbuffer) && nbuffer != obuffer)
UnlockReleaseBuffer(nbuffer);
if (BufferIsValid(obuffer))
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
UnlockReleaseBuffer(obuffer);
/*
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
* If the new page is running low on free space, update the FSM as well.
* Arbitrarily, our definition of "low" is less than 20%. We can't do much
* better than that without knowing the fill-factor for the table.
*
* However, don't update the FSM on HOT updates, because after crash
* recovery, either the old or the new tuple will certainly be dead and
* prunable. After pruning, the page will have roughly as much free space
* as it did before the update, assuming the new tuple is about the same
* size as the old one.
*
* XXX: Don't do this if the page was restored from full page image. We
* don't bother to update the FSM in that case, it doesn't need to be
* totally accurate anyway.
*/
if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecordPageWithFreeSpace(rlocator, newblk, freespace);
}
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
static void
heap_xlog_confirm(XLogReaderState *record)
{
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
Buffer buffer;
Page page;
OffsetNumber offnum;
ItemId lp = NULL;
HeapTupleHeader htup;
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
{
page = BufferGetPage(buffer);
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
offnum = xlrec->offnum;
if (PageGetMaxOffsetNumber(page) >= offnum)
lp = PageGetItemId(page, offnum);
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(PANIC, "invalid lp");
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
htup = (HeapTupleHeader) PageGetItem(page, lp);
/*
* Confirm tuple as actually inserted
*/
ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
}
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_lock(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
Buffer buffer;
Page page;
OffsetNumber offnum;
ItemId lp = NULL;
HeapTupleHeader htup;
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
Buffer vmbuffer = InvalidBuffer;
BlockNumber block;
Relation reln;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block);
reln = CreateFakeRelcacheEntry(rlocator);
visibilitymap_pin(reln, block, &vmbuffer);
visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
{
page = (Page) BufferGetPage(buffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
offnum = xlrec->offnum;
if (PageGetMaxOffsetNumber(page) >= offnum)
lp = PageGetItemId(page, offnum);
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(PANIC, "invalid lp");
htup = (HeapTupleHeader) PageGetItem(page, lp);
htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
&htup->t_infomask2);
/*
* Clear relevant update flags, but only if the modified infomask says
* there's no update.
*/
if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
{
HeapTupleHeaderClearHotUpdated(htup);
/* Make sure there is no forward chain link in t_ctid */
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
ItemPointerSet(&htup->t_ctid,
BufferGetBlockNumber(buffer),
offnum);
}
HeapTupleHeaderSetXmax(htup, xlrec->xmax);
HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
}
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_lock_updated(XLogReaderState *record)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_lock_updated *xlrec;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
Buffer buffer;
Page page;
OffsetNumber offnum;
ItemId lp = NULL;
HeapTupleHeader htup;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
/*
* The visibility map may need to be fixed even if the heap page is
* already up-to-date.
*/
if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
{
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
RelFileLocator rlocator;
Buffer vmbuffer = InvalidBuffer;
BlockNumber block;
Relation reln;
Change internal RelFileNode references to RelFileNumber or RelFileLocator. We have been using the term RelFileNode to refer to either (1) the integer that is used to name the sequence of files for a certain relation within the directory set aside for that tablespace/database combination; or (2) that value plus the OIDs of the tablespace and database; or occasionally (3) the whole series of files created for a relation based on those values. Using the same name for more than one thing is confusing. Replace RelFileNode with RelFileNumber when we're talking about just the single number, i.e. (1) from above, and with RelFileLocator when we're talking about all the things that are needed to locate a relation's files on disk, i.e. (2) from above. In the places where we refer to (3) as a relfilenode, instead refer to "relation storage". Since there is a ton of SQL code in the world that knows about pg_class.relfilenode, don't change the name of that column, or of other SQL-facing things that derive their name from it. On the other hand, do adjust closely-related internal terminology. For example, the structure member names dbNode and spcNode appear to be derived from the fact that the structure itself was called RelFileNode, so change those to dbOid and spcOid. Likewise, various variables with names like rnode and relnode get renamed appropriately, according to how they're being used in context. Hopefully, this is clearer than before. It is also preparation for future patches that intend to widen the relfilenumber fields from its current width of 32 bits. Variables that store a relfilenumber are now declared as type RelFileNumber rather than type Oid; right now, these are the same, but that can now more easily be changed. Dilip Kumar, per an idea from me. Reviewed also by Andres Freund. I fixed some whitespace issues, changed a couple of words in a comment, and made one other minor correction. Discussion: http://postgr.es/m/CA+TgmoamOtXbVAQf9hWFzonUo6bhhjS6toZQd7HZ-pmojtAmag@mail.gmail.com Discussion: http://postgr.es/m/CA+Tgmobp7+7kmi4gkq7Y+4AM9fTvL+O1oQ4-5gFTT+6Ng-dQ=g@mail.gmail.com Discussion: http://postgr.es/m/CAFiTN-vTe79M8uDH1yprOU64MNFE+R3ODRuA+JWf27JbhY4hJw@mail.gmail.com
2022-07-06 17:39:09 +02:00
XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block);
reln = CreateFakeRelcacheEntry(rlocator);
visibilitymap_pin(reln, block, &vmbuffer);
visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
ReleaseBuffer(vmbuffer);
FreeFakeRelcacheEntry(reln);
}
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
{
page = BufferGetPage(buffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
offnum = xlrec->offnum;
if (PageGetMaxOffsetNumber(page) >= offnum)
lp = PageGetItemId(page, offnum);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(PANIC, "invalid lp");
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
htup = (HeapTupleHeader) PageGetItem(page, lp);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
&htup->t_infomask2);
HeapTupleHeaderSetXmax(htup, xlrec->xmax);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
}
static void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_inplace(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
XLogRecPtr lsn = record->EndRecPtr;
xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
Buffer buffer;
Page page;
OffsetNumber offnum;
ItemId lp = NULL;
HeapTupleHeader htup;
uint32 oldlen;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
Size newlen;
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
Fix multiple problems in WAL replay. Most of the replay functions for WAL record types that modify more than one page failed to ensure that those pages were locked correctly to ensure that concurrent queries could not see inconsistent page states. This is a hangover from coding decisions made long before Hot Standby was added, when it was hardly necessary to acquire buffer locks during WAL replay at all, let alone hold them for carefully-chosen periods. The key problem was that RestoreBkpBlocks was written to hold lock on each page restored from a full-page image for only as long as it took to update that page. This was guaranteed to break any WAL replay function in which there was any update-ordering constraint between pages, because even if the nominal order of the pages is the right one, any mixture of full-page and non-full-page updates in the same record would result in out-of-order updates. Moreover, it wouldn't work for situations where there's a requirement to maintain lock on one page while updating another. Failure to honor an update ordering constraint in this way is thought to be the cause of bug #7648 from Daniel Farina: what seems to have happened there is that a btree page being split was rewritten from a full-page image before the new right sibling page was written, and because lock on the original page was not maintained it was possible for hot standby queries to try to traverse the page's right-link to the not-yet-existing sibling page. To fix, get rid of RestoreBkpBlocks as such, and instead create a new function RestoreBackupBlock that restores just one full-page image at a time. This function can be invoked by WAL replay functions at the points where they would otherwise perform non-full-page updates; in this way, the physical order of page updates remains the same no matter which pages are replaced by full-page images. We can then further adjust the logic in individual replay functions if it is necessary to hold buffer locks for overlapping periods. A side benefit is that we can simplify the handling of concurrency conflict resolution by moving that code into the record-type-specfic functions; there's no more need to contort the code layout to keep conflict resolution in front of the RestoreBkpBlocks call. In connection with that, standardize on zero-based numbering rather than one-based numbering for referencing the full-page images. In HEAD, I removed the macros XLR_BKP_BLOCK_1 through XLR_BKP_BLOCK_4. They are still there in the header files in previous branches, but are no longer used by the code. In addition, fix some other bugs identified in the course of making these changes: spgRedoAddNode could fail to update the parent downlink at all, if the parent tuple is in the same page as either the old or new split tuple and we're not doing a full-page image: it would get fooled by the LSN having been advanced already. This would result in permanent index corruption, not just transient failure of concurrent queries. Also, ginHeapTupleFastInsert's "merge lists" case failed to mark the old tail page as a candidate for a full-page image; in the worst case this could result in torn-page corruption. heap_xlog_freeze() was inconsistent about using a cleanup lock or plain exclusive lock: it did the former in the normal path but the latter for a full-page image. A plain exclusive lock seems sufficient, so change to that. Also, remove gistRedoPageDeleteRecord(), which has been dead code since VACUUM FULL was rewritten. Back-patch to 9.0, where hot standby was introduced. Note however that 9.0 had a significantly different WAL-logging scheme for GIST index updates, and it doesn't appear possible to make that scheme safe for concurrent hot standby queries, because it can leave inconsistent states in the index even between WAL records. Given the lack of complaints from the field, we won't work too hard on fixing that branch.
2012-11-13 04:05:08 +01:00
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
char *newtup = XLogRecGetBlockData(record, 0, &newlen);
page = BufferGetPage(buffer);
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
offnum = xlrec->offnum;
if (PageGetMaxOffsetNumber(page) >= offnum)
lp = PageGetItemId(page, offnum);
if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
elog(PANIC, "invalid lp");
htup = (HeapTupleHeader) PageGetItem(page, lp);
oldlen = ItemIdGetLength(lp) - htup->t_hoff;
if (oldlen != newlen)
elog(PANIC, "wrong tuple length");
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
memcpy((char *) htup + htup->t_hoff, newtup, newlen);
PageSetLSN(page, lsn);
MarkBufferDirty(buffer);
}
if (BufferIsValid(buffer))
UnlockReleaseBuffer(buffer);
}
void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_redo(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* These operations don't overwrite MVCC data so no conflict processing is
* required. The ones in heap2 rmgr do.
*/
switch (info & XLOG_HEAP_OPMASK)
{
case XLOG_HEAP_INSERT:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_insert(record);
break;
case XLOG_HEAP_DELETE:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_delete(record);
break;
case XLOG_HEAP_UPDATE:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_update(record, false);
break;
case XLOG_HEAP_TRUNCATE:
/*
* TRUNCATE is a no-op because the actions are already logged as
* SMGR WAL records. TRUNCATE WAL record only exists for logical
* decoding.
*/
break;
case XLOG_HEAP_HOT_UPDATE:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_update(record, true);
break;
Add support for INSERT ... ON CONFLICT DO NOTHING/UPDATE. The newly added ON CONFLICT clause allows to specify an alternative to raising a unique or exclusion constraint violation error when inserting. ON CONFLICT refers to constraints that can either be specified using a inference clause (by specifying the columns of a unique constraint) or by naming a unique or exclusion constraint. DO NOTHING avoids the constraint violation, without touching the pre-existing row. DO UPDATE SET ... [WHERE ...] updates the pre-existing tuple, and has access to both the tuple proposed for insertion and the existing tuple; the optional WHERE clause can be used to prevent an update from being executed. The UPDATE SET and WHERE clauses have access to the tuple proposed for insertion using the "magic" EXCLUDED alias, and to the pre-existing tuple using the table name or its alias. This feature is often referred to as upsert. This is implemented using a new infrastructure called "speculative insertion". It is an optimistic variant of regular insertion that first does a pre-check for existing tuples and then attempts an insert. If a violating tuple was inserted concurrently, the speculatively inserted tuple is deleted and a new attempt is made. If the pre-check finds a matching tuple the alternative DO NOTHING or DO UPDATE action is taken. If the insertion succeeds without detecting a conflict, the tuple is deemed inserted. To handle the possible ambiguity between the excluded alias and a table named excluded, and for convenience with long relation names, INSERT INTO now can alias its target table. Bumps catversion as stored rules change. Author: Peter Geoghegan, with significant contributions from Heikki Linnakangas and Andres Freund. Testing infrastructure by Jeff Janes. Reviewed-By: Heikki Linnakangas, Andres Freund, Robert Haas, Simon Riggs, Dean Rasheed, Stephen Frost and many others.
2015-05-08 05:31:36 +02:00
case XLOG_HEAP_CONFIRM:
heap_xlog_confirm(record);
break;
case XLOG_HEAP_LOCK:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_lock(record);
break;
case XLOG_HEAP_INPLACE:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_inplace(record);
break;
default:
elog(PANIC, "heap_redo: unknown op code %u", info);
}
}
void
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap2_redo(XLogReaderState *record)
{
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
switch (info & XLOG_HEAP_OPMASK)
{
Remove tupgone special case from vacuumlazy.c. Retry the call to heap_prune_page() in rare cases where there is disagreement between the heap_prune_page() call and the call to HeapTupleSatisfiesVacuum() that immediately follows. Disagreement is possible when a concurrently-aborted transaction makes a tuple DEAD during the tiny window between each step. This was the only case where a tuple considered DEAD by VACUUM still had storage following pruning. VACUUM's definition of dead tuples is now uniformly simple and unambiguous: dead tuples from each page are always LP_DEAD line pointers that were encountered just after we performed pruning (and just before we considered freezing remaining items with tuple storage). Eliminating the tupgone=true special case enables INDEX_CLEANUP=off style skipping of index vacuuming that takes place based on flexible, dynamic criteria. The INDEX_CLEANUP=off case had to know about skipping indexes up-front before now, due to a subtle interaction with the special case (see commit dd695979) -- this was a special case unto itself. Now there are no special cases. And so now it won't matter when or how we decide to skip index vacuuming: it won't affect how pruning behaves, and it won't be affected by any of the implementation details of pruning or freezing. Also remove XLOG_HEAP2_CLEANUP_INFO records. These are no longer necessary because we now rely entirely on heap pruning taking care of recovery conflicts. There is no longer any need to generate recovery conflicts for DEAD tuples that pruning just missed. This also means that heap vacuuming now uses exactly the same strategy for recovery conflicts as index vacuuming always has: REDO routines never need to process a latestRemovedXid from the WAL record, since earlier REDO of the WAL record from pruning is sufficient in all cases. The generic XLOG_HEAP2_CLEAN record type is now split into two new record types to reflect this new division (these are called XLOG_HEAP2_PRUNE and XLOG_HEAP2_VACUUM). Also stop acquiring a super-exclusive lock for heap pages when they're vacuumed during VACUUM's second heap pass. A regular exclusive lock is enough. This is correct because heap page vacuuming is now strictly a matter of setting the LP_DEAD line pointers to LP_UNUSED. No other backend can have a pointer to a tuple located in a pinned buffer that can be invalidated by a concurrent heap page vacuum operation. Heap vacuuming can now be thought of as conceptually similar to index vacuuming and conceptually dissimilar to heap pruning. Heap pruning now has sole responsibility for anything involving the logical contents of the database (e.g., managing transaction status information, recovery conflicts, considering what to do with HOT chains). Index vacuuming and heap vacuuming are now only concerned with recycling garbage items from physical data structures that back the logical database. Bump XLOG_PAGE_MAGIC due to pruning and heap page vacuum WAL record changes. Credit for the idea of retrying pruning a page to avoid the tupgone case goes to Andres Freund. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: Andres Freund <andres@anarazel.de> Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com> Discussion: https://postgr.es/m/CAH2-WznneCXTzuFmcwx_EyRQgfsfJAAsu+CsqRFmFXCAar=nJw@mail.gmail.com
2021-04-06 17:49:22 +02:00
case XLOG_HEAP2_PRUNE:
heap_xlog_prune(record);
break;
case XLOG_HEAP2_VACUUM:
heap_xlog_vacuum(record);
break;
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
case XLOG_HEAP2_FREEZE_PAGE:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_freeze_page(record);
Rework tuple freezing protocol Tuple freezing was broken in connection to MultiXactIds; commit 8e53ae025de9 tried to fix it, but didn't go far enough. As noted by Noah Misch, freezing a tuple whose Xmax is a multi containing an aborted update might cause locks in the multi to go ignored by later transactions. This is because the code depended on a multixact above their cutoff point not having any lock-only member older than the cutoff point for Xids, which is easily defeated in READ COMMITTED transactions. The fix for this involves creating a new MultiXactId when necessary. But this cannot be done during WAL replay, and moreover multixact examination requires using CLOG access routines which are not supposed to be used during WAL replay either; so tuple freezing cannot be done with the old freeze WAL record. Therefore, separate the freezing computation from its execution, and change the WAL record to carry all necessary information. At WAL replay time, it's easy to re-execute freezing because we don't need to re-compute the new infomask/Xmax values but just take them from the WAL record. While at it, restructure the coding to ensure all page changes occur in a single critical section without much room for failures. The previous coding wasn't using a critical section, without any explanation as to why this was acceptable. In replication scenarios using the 9.3 branch, standby servers must be upgraded before their master, so that they are prepared to deal with the new WAL record once the master is upgraded; failure to do so will cause WAL replay to die with a PANIC message. Later upgrade of the standby will allow the process to continue where it left off, so there's no disruption of the data in the standby in any case. Standbys know how to deal with the old WAL record, so it's okay to keep the master running the old code for a while. In master, the old freeze WAL record is gone, for cleanliness' sake; there's no compatibility concern there. Backpatch to 9.3, where the original bug was introduced and where the previous fix was backpatched. Álvaro Herrera and Andres Freund
2013-12-16 15:29:50 +01:00
break;
case XLOG_HEAP2_VISIBLE:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_visible(record);
break;
case XLOG_HEAP2_MULTI_INSERT:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_multi_insert(record);
break;
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
case XLOG_HEAP2_LOCK_UPDATED:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_lock_updated(record);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
break;
case XLOG_HEAP2_NEW_CID:
/*
* Nothing to do on a real replay, only used during logical
* decoding.
*/
break;
case XLOG_HEAP2_REWRITE:
Revamp the WAL record format. Each WAL record now carries information about the modified relation and block(s) in a standardized format. That makes it easier to write tools that need that information, like pg_rewind, prefetching the blocks to speed up recovery, etc. There's a whole new API for building WAL records, replacing the XLogRecData chains used previously. The new API consists of XLogRegister* functions, which are called for each buffer and chunk of data that is added to the record. The new API also gives more control over when a full-page image is written, by passing flags to the XLogRegisterBuffer function. This also simplifies the XLogReadBufferForRedo() calls. The function can dig the relation and block number from the WAL record, so they no longer need to be passed as arguments. For the convenience of redo routines, XLogReader now disects each WAL record after reading it, copying the main data part and the per-block data into MAXALIGNed buffers. The data chunks are not aligned within the WAL record, but the redo routines can assume that the pointers returned by XLogRecGet* functions are. Redo routines are now passed the XLogReaderState, which contains the record in the already-disected format, instead of the plain XLogRecord. The new record format also makes the fixed size XLogRecord header smaller, by removing the xl_len field. The length of the "main data" portion is now stored at the end of the WAL record, and there's a separate header after XLogRecord for it. The alignment padding at the end of XLogRecord is also removed. This compansates for the fact that the new format would otherwise be more bulky than the old format. Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera, Fujii Masao.
2014-11-20 16:56:26 +01:00
heap_xlog_logical_rewrite(record);
break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}
}
/*
* Mask a heap page before performing consistency checks on it.
*/
void
heap_mask(char *pagedata, BlockNumber blkno)
{
Page page = (Page) pagedata;
OffsetNumber off;
mask_page_lsn_and_checksum(page);
mask_page_hint_bits(page);
mask_unused_space(page);
for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
{
ItemId iid = PageGetItemId(page, off);
char *page_item;
page_item = (char *) (page + ItemIdGetOffset(iid));
if (ItemIdIsNormal(iid))
{
HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
/*
* If xmin of a tuple is not yet frozen, we should ignore
* differences in hint bits, since they can be set without
* emitting WAL.
*/
if (!HeapTupleHeaderXminFrozen(page_htup))
page_htup->t_infomask &= ~HEAP_XACT_MASK;
else
{
/* Still we need to mask xmax hint bits. */
page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
}
/*
* During replay, we set Command Id to FirstCommandId. Hence, mask
* it. See heap_xlog_insert() for details.
*/
page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
/*
* For a speculative tuple, heap_insert() does not set ctid in the
* caller-passed heap tuple itself, leaving the ctid field to
* contain a speculative token value - a per-backend monotonically
* increasing identifier. Besides, it does not WAL-log ctid under
* any circumstances.
*
* During redo, heap_xlog_insert() sets t_ctid to current block
* number and self offset number. It doesn't care about any
* speculative insertions on the primary. Hence, we set t_ctid to
* current block number and self offset number to ignore any
* inconsistency.
*/
if (HeapTupleHeaderIsSpeculative(page_htup))
ItemPointerSet(&page_htup->t_ctid, blkno, off);
2018-04-07 22:24:10 +02:00
/*
* NB: Not ignoring ctid changes due to the tuple having moved
* (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
* important information that needs to be in-sync between primary
* and standby, and thus is WAL logged.
*/
}
/*
* Ignore any padding bytes after the tuple, when the length of the
* item is not MAXALIGNed.
*/
if (ItemIdHasStorage(iid))
{
int len = ItemIdGetLength(iid);
int padlen = MAXALIGN(len) - len;
if (padlen > 0)
memset(page_item + len, MASK_MARKER, padlen);
}
}
}
/*
* HeapCheckForSerializableConflictOut
* We are reading a tuple. If it's not visible, there may be a
* rw-conflict out with the inserter. Otherwise, if it is visible to us
* but has been deleted, there may be a rw-conflict out with the deleter.
*
* We will determine the top level xid of the writing transaction with which
* we may be in conflict, and ask CheckForSerializableConflictOut() to check
* for overlap with our own transaction.
*
* This function should be called just about anywhere in heapam.c where a
* tuple has been read. The caller must hold at least a shared lock on the
* buffer, because this function might set hint bits on the tuple. There is
* currently no known reason to call this function from an index AM.
*/
void
HeapCheckForSerializableConflictOut(bool visible, Relation relation,
HeapTuple tuple, Buffer buffer,
Snapshot snapshot)
{
TransactionId xid;
HTSV_Result htsvResult;
if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
return;
/*
* Check to see whether the tuple has been written to by a concurrent
* transaction, either to create it not visible to us, or to delete it
* while it is visible to us. The "visible" bool indicates whether the
* tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
* is going on with it.
*
* In the event of a concurrently inserted tuple that also happens to have
* been concurrently updated (by a separate transaction), the xmin of the
* tuple will be used -- not the updater's xid.
*/
htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
switch (htsvResult)
{
case HEAPTUPLE_LIVE:
if (visible)
return;
xid = HeapTupleHeaderGetXmin(tuple->t_data);
break;
case HEAPTUPLE_RECENTLY_DEAD:
case HEAPTUPLE_DELETE_IN_PROGRESS:
if (visible)
xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
else
xid = HeapTupleHeaderGetXmin(tuple->t_data);
if (TransactionIdPrecedes(xid, TransactionXmin))
{
/* This is like the HEAPTUPLE_DEAD case */
Assert(!visible);
return;
}
break;
case HEAPTUPLE_INSERT_IN_PROGRESS:
xid = HeapTupleHeaderGetXmin(tuple->t_data);
break;
case HEAPTUPLE_DEAD:
Assert(!visible);
return;
default:
/*
* The only way to get to this default clause is if a new value is
* added to the enum type without adding it to this switch
* statement. That's a bug, so elog.
*/
elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
/*
* In spite of having all enum values covered and calling elog on
* this default, some compilers think this is a code path which
* allows xid to be used below without initialization. Silence
* that warning.
*/
xid = InvalidTransactionId;
}
Assert(TransactionIdIsValid(xid));
Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
/*
* Find top level xid. Bail out if xid is too early to be a conflict, or
* if it's our own xid.
*/
if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
return;
xid = SubTransGetTopmostTransaction(xid);
if (TransactionIdPrecedes(xid, TransactionXmin))
return;
CheckForSerializableConflictOut(relation, xid, snapshot);
}