postgresql/src/backend/access/heap/pruneheap.c

825 lines
25 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* pruneheap.c
* heap page pruning and HOT-chain management code
*
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/access/heap/pruneheap.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/transam.h"
#include "access/htup_details.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/snapmgr.h"
#include "utils/rel.h"
#include "utils/tqual.h"
/* Working data for heap_page_prune and subroutines */
typedef struct
{
TransactionId new_prune_xid; /* new prune hint value for page */
2010-02-26 03:01:40 +01:00
TransactionId latestRemovedXid; /* latest xid to be removed by this
* prune */
int nredirected; /* numbers of entries in arrays below */
int ndead;
int nunused;
/* arrays that accumulate indexes of items to be changed */
OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
OffsetNumber nowdead[MaxHeapTuplesPerPage];
OffsetNumber nowunused[MaxHeapTuplesPerPage];
/* marked[i] is TRUE if item i is entered in one of the above arrays */
bool marked[MaxHeapTuplesPerPage + 1];
} PruneState;
/* Local functions */
2007-11-15 22:14:46 +01:00
static int heap_prune_chain(Relation relation, Buffer buffer,
OffsetNumber rootoffnum,
TransactionId OldestXmin,
PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
OffsetNumber offnum, OffsetNumber rdoffnum);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
/*
* Optionally prune and repair fragmentation in the specified page.
*
* This is an opportunistic function. It will perform housekeeping
* only if the page heuristically looks like a candidate for pruning and we
* can acquire buffer cleanup lock without blocking.
*
* Note: this is called quite often. It's important that it fall out quickly
* if there's not any use in pruning.
*
* Caller must have pin on the buffer, and must *not* have a lock on it.
*
* OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
* or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
*/
void
heap_page_prune_opt(Relation relation, Buffer buffer)
{
Page page = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
Size minfree;
TransactionId OldestXmin;
/*
* We can't write WAL in recovery mode, so there's no point trying to
* clean the page. The master will likely issue a cleaning WAL record soon
* anyway, so this is no particular loss.
*/
if (RecoveryInProgress())
return;
/*
* Use the appropriate xmin horizon for this relation. If it's a proper
* catalog relation or a user defined, additional, catalog relation, we
* need to use the horizon that includes slots, otherwise the data-only
* horizon can be used. Note that the toast relation of user defined
* relations are *not* considered catalog relations.
*/
if (IsCatalogRelation(relation) ||
RelationIsAccessibleInLogicalDecoding(relation))
OldestXmin = RecentGlobalXmin;
else
OldestXmin = RecentGlobalDataXmin;
Assert(TransactionIdIsValid(OldestXmin));
/*
* Let's see if we really need pruning.
*
* Forget it if page is not hinted to contain something prunable that's
* older than OldestXmin.
*/
if (!PageIsPrunable(page, OldestXmin))
return;
/*
2007-11-15 22:14:46 +01:00
* We prune when a previous UPDATE failed to find enough space on the page
* for a new tuple version, or when free space falls below the relation's
* fill-factor target (but not less than 10%).
*
2007-11-15 22:14:46 +01:00
* Checking free space here is questionable since we aren't holding any
* lock on the buffer; in the worst case we could get a bogus answer. It's
* unlikely to be *seriously* wrong, though, since reading either pd_lower
* or pd_upper is probably atomic. Avoiding taking a lock seems more
* important than sometimes getting a wrong answer in what is after all
* just a heuristic estimate.
*/
minfree = RelationGetTargetPageFreeSpace(relation,
HEAP_DEFAULT_FILLFACTOR);
minfree = Max(minfree, BLCKSZ / 10);
if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
{
/* OK, try to get exclusive buffer lock */
if (!ConditionalLockBufferForCleanup(buffer))
return;
/*
* Now that we have buffer lock, get accurate information about the
2007-11-15 22:14:46 +01:00
* page's free space, and recheck the heuristic about whether to
* prune. (We needn't recheck PageIsPrunable, since no one else could
* have pruned while we hold pin.)
*/
if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
{
2010-07-06 21:19:02 +02:00
TransactionId ignore = InvalidTransactionId; /* return value not
* needed */
/* OK to prune */
(void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore);
}
/* And release buffer lock */
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}
}
/*
* Prune and repair fragmentation in the specified page.
*
* Caller must have pin and buffer cleanup lock on the page.
*
* OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
* or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
*
* If report_stats is true then we send the number of reclaimed heap-only
* tuples to pgstats. (This must be FALSE during vacuum, since vacuum will
* send its own new total to pgstats, and we don't want this delta applied
* on top of that.)
*
* Returns the number of tuples deleted from the page and sets
* latestRemovedXid.
*/
int
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
bool report_stats, TransactionId *latestRemovedXid)
{
2007-11-15 22:14:46 +01:00
int ndeleted = 0;
Page page = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
2007-11-15 22:14:46 +01:00
OffsetNumber offnum,
maxoff;
PruneState prstate;
2007-11-15 22:14:46 +01:00
/*
* Our strategy is to scan the page and make lists of items to change,
* then apply the changes within a critical section. This keeps as much
* logic as possible out of the critical section, and also ensures that
* WAL replay will work the same as the normal case.
*
* First, initialize the new pd_prune_xid value to zero (indicating no
* prunable tuples). If we find any tuples which may soon become
2010-02-26 03:01:40 +01:00
* prunable, we will save the lowest relevant XID in new_prune_xid. Also
* initialize the rest of our working state.
*/
prstate.new_prune_xid = InvalidTransactionId;
prstate.latestRemovedXid = *latestRemovedXid;
prstate.nredirected = prstate.ndead = prstate.nunused = 0;
memset(prstate.marked, 0, sizeof(prstate.marked));
/* Scan the page */
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
{
ItemId itemid;
/* Ignore items already processed as part of an earlier chain */
if (prstate.marked[offnum])
continue;
/* Nothing to do if slot is empty or already dead */
itemid = PageGetItemId(page, offnum);
if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
continue;
/* Process this item or chain of items */
ndeleted += heap_prune_chain(relation, buffer, offnum,
OldestXmin,
&prstate);
}
/* Any error while applying the changes is critical */
START_CRIT_SECTION();
/* Have we found any prunable items? */
if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
{
/*
* Apply the planned item changes, then repair page fragmentation, and
* update the page's hint bit about whether it has free line pointers.
*/
heap_page_prune_execute(buffer,
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
prstate.nowunused, prstate.nunused);
/*
* Update the page's pd_prune_xid field to either zero, or the lowest
* XID of any soon-prunable tuple.
*/
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
/*
* Also clear the "page is full" flag, since there's no point in
* repeating the prune/defrag process until something else happens to
* the page.
*/
PageClearFull(page);
MarkBufferDirty(buffer);
/*
* Emit a WAL HEAP_CLEAN record showing what we did
*/
if (RelationNeedsWAL(relation))
{
XLogRecPtr recptr;
recptr = log_heap_clean(relation, buffer,
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
prstate.nowunused, prstate.nunused,
prstate.latestRemovedXid);
PageSetLSN(BufferGetPage(buffer, NULL, NULL,
BGP_NO_SNAPSHOT_TEST), recptr);
}
}
else
{
/*
* If we didn't prune anything, but have found a new value for the
* pd_prune_xid field, update it and mark the buffer dirty. This is
* treated as a non-WAL-logged hint.
*
* Also clear the "page is full" flag if it is set, since there's no
* point in repeating the prune/defrag process until something else
* happens to the page.
*/
if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
PageIsFull(page))
{
((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
PageClearFull(page);
MarkBufferDirtyHint(buffer, true);
}
}
END_CRIT_SECTION();
/*
2007-11-15 22:14:46 +01:00
* If requested, report the number of tuples reclaimed to pgstats. This is
* ndeleted minus ndead, because we don't want to count a now-DEAD root
* item as a deletion for this purpose.
*/
if (report_stats && ndeleted > prstate.ndead)
pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);
*latestRemovedXid = prstate.latestRemovedXid;
/*
* XXX Should we update the FSM information of this page ?
*
2007-11-15 22:14:46 +01:00
* There are two schools of thought here. We may not want to update FSM
* information so that the page is not used for unrelated UPDATEs/INSERTs
* and any free space in this page will remain available for further
* UPDATEs in *this* page, thus improving chances for doing HOT updates.
*
2007-11-15 22:14:46 +01:00
* But for a large table and where a page does not receive further UPDATEs
* for a long time, we might waste this space by not updating the FSM
* information. The relation may get extended and fragmented further.
*
2007-11-15 22:14:46 +01:00
* One possibility is to leave "fillfactor" worth of space in this page
* and update FSM with the remaining space.
*/
return ndeleted;
}
/*
* Prune specified item pointer or a HOT chain originating at that item.
*
* If the item is an index-referenced tuple (i.e. not a heap-only tuple),
* the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
* chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
* This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
* DEAD, the OldestXmin test is just too coarse to detect it.
*
* The root line pointer is redirected to the tuple immediately after the
* latest DEAD tuple. If all tuples in the chain are DEAD, the root line
* pointer is marked LP_DEAD. (This includes the case of a DEAD simple
* tuple, which we treat as a chain of length 1.)
*
* OldestXmin is the cutoff XID used to identify dead tuples.
*
* We don't actually change the page here, except perhaps for hint-bit updates
* caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in
* prstate showing the changes to be made. Items to be redirected are added
* to the redirected[] array (two entries per redirection); items to be set to
* LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
* state are added to nowunused[].
*
* Returns the number of tuples (to be) deleted from the page.
*/
static int
heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
TransactionId OldestXmin,
PruneState *prstate)
{
2007-11-15 22:14:46 +01:00
int ndeleted = 0;
Page dp = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
2007-11-15 22:14:46 +01:00
TransactionId priorXmax = InvalidTransactionId;
ItemId rootlp;
HeapTupleHeader htup;
OffsetNumber latestdead = InvalidOffsetNumber,
maxoff = PageGetMaxOffsetNumber(dp),
offnum;
OffsetNumber chainitems[MaxHeapTuplesPerPage];
int nchain = 0,
i;
HeapTupleData tup;
tup.t_tableOid = RelationGetRelid(relation);
rootlp = PageGetItemId(dp, rootoffnum);
/*
* If it's a heap-only tuple, then it is not the start of a HOT chain.
*/
if (ItemIdIsNormal(rootlp))
{
htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
tup.t_data = htup;
tup.t_len = ItemIdGetLength(rootlp);
ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum);
if (HeapTupleHeaderIsHeapOnly(htup))
{
/*
2007-11-15 22:14:46 +01:00
* If the tuple is DEAD and doesn't chain to anything else, mark
* it unused immediately. (If it does chain, we can only remove
* it as part of pruning its chain.)
*
* We need this primarily to handle aborted HOT updates, that is,
2007-11-15 22:14:46 +01:00
* XMIN_INVALID heap-only tuples. Those might not be linked to by
* any chain, since the parent tuple might be re-updated before
* any pruning occurs. So we have to be able to reap them
* separately from chain-pruning. (Note that
* HeapTupleHeaderIsHotUpdated will never return true for an
* XMIN_INVALID tuple, so this code will work even when there were
* sequential updates within the aborted transaction.)
*
* Note that we might first arrive at a dead heap-only tuple
* either here or while following a chain below. Whichever path
* gets there first will mark the tuple unused.
*/
if (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer)
== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
{
heap_prune_record_unused(prstate, rootoffnum);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
HeapTupleHeaderAdvanceLatestRemovedXid(htup,
2010-02-26 03:01:40 +01:00
&prstate->latestRemovedXid);
ndeleted++;
}
/* Nothing more to do */
return ndeleted;
}
}
/* Start from the root tuple */
offnum = rootoffnum;
/* while not end of the chain */
for (;;)
{
2007-11-15 22:14:46 +01:00
ItemId lp;
bool tupdead,
recent_dead;
/* Some sanity checks */
if (offnum < FirstOffsetNumber || offnum > maxoff)
break;
/* If item is already processed, stop --- it must not be same chain */
if (prstate->marked[offnum])
break;
lp = PageGetItemId(dp, offnum);
/* Unused item obviously isn't part of the chain */
if (!ItemIdIsUsed(lp))
break;
/*
2007-11-15 22:14:46 +01:00
* If we are looking at the redirected root line pointer, jump to the
* first normal tuple in the chain. If we find a redirect somewhere
* else, stop --- it must not be same chain.
*/
if (ItemIdIsRedirected(lp))
{
if (nchain > 0)
break; /* not at start of chain */
chainitems[nchain++] = offnum;
offnum = ItemIdGetRedirect(rootlp);
continue;
}
/*
2007-11-15 22:14:46 +01:00
* Likewise, a dead item pointer can't be part of the chain. (We
* already eliminated the case of dead root tuple outside this
* function.)
*/
if (ItemIdIsDead(lp))
break;
Assert(ItemIdIsNormal(lp));
htup = (HeapTupleHeader) PageGetItem(dp, lp);
tup.t_data = htup;
tup.t_len = ItemIdGetLength(lp);
ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum);
/*
* Check the tuple XMIN against prior XMAX, if any
*/
if (TransactionIdIsValid(priorXmax) &&
!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
break;
/*
* OK, this tuple is indeed a member of the chain.
*/
chainitems[nchain++] = offnum;
/*
* Check tuple's visibility status.
*/
tupdead = recent_dead = false;
switch (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer))
{
case HEAPTUPLE_DEAD:
tupdead = true;
break;
case HEAPTUPLE_RECENTLY_DEAD:
recent_dead = true;
2007-11-15 22:14:46 +01:00
/*
* This tuple may soon become DEAD. Update the hint field so
* that the page is reconsidered for pruning in future.
*/
heap_prune_record_prunable(prstate,
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
HeapTupleHeaderGetUpdateXid(htup));
break;
case HEAPTUPLE_DELETE_IN_PROGRESS:
/*
* This tuple may soon become DEAD. Update the hint field so
* that the page is reconsidered for pruning in future.
*/
heap_prune_record_prunable(prstate,
HeapTupleHeaderGetUpdateXid(htup));
break;
case HEAPTUPLE_LIVE:
case HEAPTUPLE_INSERT_IN_PROGRESS:
2007-11-15 22:14:46 +01:00
/*
* If we wanted to optimize for aborts, we might consider
* marking the page prunable when we see INSERT_IN_PROGRESS.
2007-11-15 22:14:46 +01:00
* But we don't. See related decisions about when to mark the
* page prunable in heapam.c.
*/
break;
default:
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
break;
}
/*
* Remember the last DEAD tuple seen. We will advance past
* RECENTLY_DEAD tuples just in case there's a DEAD one after them;
* but we can't advance past anything else. (XXX is it really worth
* continuing to scan beyond RECENTLY_DEAD? The case where we will
* find another DEAD tuple is a fairly unusual corner case.)
*/
if (tupdead)
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
{
latestdead = offnum;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
HeapTupleHeaderAdvanceLatestRemovedXid(htup,
2010-02-26 03:01:40 +01:00
&prstate->latestRemovedXid);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
}
else if (!recent_dead)
break;
/*
* If the tuple is not HOT-updated, then we are at the end of this
* HOT-update chain.
*/
if (!HeapTupleHeaderIsHotUpdated(htup))
break;
/*
* Advance to next chain member.
*/
Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
BufferGetBlockNumber(buffer));
offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
priorXmax = HeapTupleHeaderGetUpdateXid(htup);
}
/*
* If we found a DEAD tuple in the chain, adjust the HOT chain so that all
* the DEAD tuples at the start of the chain are removed and the root line
* pointer is appropriately redirected.
*/
if (OffsetNumberIsValid(latestdead))
{
/*
* Mark as unused each intermediate item that we are able to remove
* from the chain.
*
2007-11-15 22:14:46 +01:00
* When the previous item is the last dead tuple seen, we are at the
* right candidate for redirection.
*/
for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
{
heap_prune_record_unused(prstate, chainitems[i]);
ndeleted++;
}
/*
2007-11-15 22:14:46 +01:00
* If the root entry had been a normal tuple, we are deleting it, so
* count it in the result. But changing a redirect (even to DEAD
2007-11-15 22:14:46 +01:00
* state) doesn't count.
*/
if (ItemIdIsNormal(rootlp))
ndeleted++;
/*
* If the DEAD tuple is at the end of the chain, the entire chain is
2007-11-15 22:14:46 +01:00
* dead and the root line pointer can be marked dead. Otherwise just
* redirect the root to the correct chain member.
*/
if (i >= nchain)
heap_prune_record_dead(prstate, rootoffnum);
else
heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
}
else if (nchain < 2 && ItemIdIsRedirected(rootlp))
{
/*
* We found a redirect item that doesn't point to a valid follow-on
2007-11-15 22:14:46 +01:00
* item. This can happen if the loop in heap_page_prune caused us to
* visit the dead successor of a redirect item before visiting the
* redirect item. We can clean up by setting the redirect item to
* DEAD state.
*/
heap_prune_record_dead(prstate, rootoffnum);
}
return ndeleted;
}
/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
/*
* This should exactly match the PageSetPrunable macro. We can't store
* directly into the page header yet, so we update working state.
*/
Assert(TransactionIdIsNormal(xid));
if (!TransactionIdIsValid(prstate->new_prune_xid) ||
TransactionIdPrecedes(xid, prstate->new_prune_xid))
prstate->new_prune_xid = xid;
}
/* Record item pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
2007-11-15 22:14:46 +01:00
OffsetNumber offnum, OffsetNumber rdoffnum)
{
Assert(prstate->nredirected < MaxHeapTuplesPerPage);
prstate->redirected[prstate->nredirected * 2] = offnum;
prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
prstate->nredirected++;
Assert(!prstate->marked[offnum]);
prstate->marked[offnum] = true;
Assert(!prstate->marked[rdoffnum]);
prstate->marked[rdoffnum] = true;
}
/* Record item pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
Assert(prstate->ndead < MaxHeapTuplesPerPage);
prstate->nowdead[prstate->ndead] = offnum;
prstate->ndead++;
Assert(!prstate->marked[offnum]);
prstate->marked[offnum] = true;
}
/* Record item pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
Assert(prstate->nunused < MaxHeapTuplesPerPage);
prstate->nowunused[prstate->nunused] = offnum;
prstate->nunused++;
Assert(!prstate->marked[offnum]);
prstate->marked[offnum] = true;
}
/*
* Perform the actual page changes needed by heap_page_prune.
* It is expected that the caller has suitable pin and lock on the
* buffer, and is inside a critical section.
*
* This is split out because it is also used by heap_xlog_clean()
* to replay the WAL record when needed after a crash. Note that the
* arguments are identical to those of log_heap_clean().
*/
void
heap_page_prune_execute(Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused)
{
Page page = BufferGetPage(buffer, NULL, NULL,
BGP_NO_SNAPSHOT_TEST);
OffsetNumber *offnum;
int i;
/* Update all redirected line pointers */
offnum = redirected;
for (i = 0; i < nredirected; i++)
{
OffsetNumber fromoff = *offnum++;
OffsetNumber tooff = *offnum++;
ItemId fromlp = PageGetItemId(page, fromoff);
ItemIdSetRedirect(fromlp, tooff);
}
/* Update all now-dead line pointers */
offnum = nowdead;
for (i = 0; i < ndead; i++)
{
OffsetNumber off = *offnum++;
ItemId lp = PageGetItemId(page, off);
ItemIdSetDead(lp);
}
/* Update all now-unused line pointers */
offnum = nowunused;
for (i = 0; i < nunused; i++)
{
OffsetNumber off = *offnum++;
ItemId lp = PageGetItemId(page, off);
ItemIdSetUnused(lp);
}
/*
* Finally, repair any fragmentation, and update the page's hint bit about
* whether it has free pointers.
*/
PageRepairFragmentation(page);
}
/*
* For all items in this page, find their respective root line pointers.
* If item k is part of a HOT-chain with root at item j, then we set
* root_offsets[k - 1] = j.
*
* The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
* We zero out all unused entries.
*
* The function must be called with at least share lock on the buffer, to
* prevent concurrent prune operations.
*
* Note: The information collected here is valid only as long as the caller
* holds a pin on the buffer. Once pin is released, a tuple might be pruned
* and reused by a completely unrelated tuple.
*/
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
2007-11-15 22:14:46 +01:00
OffsetNumber offnum,
maxoff;
MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
{
2007-11-15 22:14:46 +01:00
ItemId lp = PageGetItemId(page, offnum);
HeapTupleHeader htup;
OffsetNumber nextoffnum;
TransactionId priorXmax;
/* skip unused and dead items */
if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
continue;
if (ItemIdIsNormal(lp))
{
htup = (HeapTupleHeader) PageGetItem(page, lp);
/*
* Check if this tuple is part of a HOT-chain rooted at some other
* tuple. If so, skip it for now; we'll process it when we find
* its root.
*/
if (HeapTupleHeaderIsHeapOnly(htup))
continue;
/*
* This is either a plain tuple or the root of a HOT-chain.
* Remember it in the mapping.
*/
root_offsets[offnum - 1] = offnum;
/* If it's not the start of a HOT-chain, we're done with it */
if (!HeapTupleHeaderIsHotUpdated(htup))
continue;
/* Set up to scan the HOT-chain */
nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
priorXmax = HeapTupleHeaderGetUpdateXid(htup);
}
else
{
/* Must be a redirect item. We do not set its root_offsets entry */
Assert(ItemIdIsRedirected(lp));
/* Set up to scan the HOT-chain */
nextoffnum = ItemIdGetRedirect(lp);
priorXmax = InvalidTransactionId;
}
/*
* Now follow the HOT-chain and collect other tuples in the chain.
*
* Note: Even though this is a nested loop, the complexity of the
* function is O(N) because a tuple in the page should be visited not
* more than twice, once in the outer loop and once in HOT-chain
* chases.
*/
for (;;)
{
lp = PageGetItemId(page, nextoffnum);
/* Check for broken chains */
if (!ItemIdIsNormal(lp))
break;
htup = (HeapTupleHeader) PageGetItem(page, lp);
if (TransactionIdIsValid(priorXmax) &&
!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
break;
/* Remember the root line pointer for this item */
root_offsets[nextoffnum - 1] = offnum;
/* Advance to next chain member, if any */
if (!HeapTupleHeaderIsHotUpdated(htup))
break;
nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
Improve concurrency of foreign key locking This patch introduces two additional lock modes for tuples: "SELECT FOR KEY SHARE" and "SELECT FOR NO KEY UPDATE". These don't block each other, in contrast with already existing "SELECT FOR SHARE" and "SELECT FOR UPDATE". UPDATE commands that do not modify the values stored in the columns that are part of the key of the tuple now grab a SELECT FOR NO KEY UPDATE lock on the tuple, allowing them to proceed concurrently with tuple locks of the FOR KEY SHARE variety. Foreign key triggers now use FOR KEY SHARE instead of FOR SHARE; this means the concurrency improvement applies to them, which is the whole point of this patch. The added tuple lock semantics require some rejiggering of the multixact module, so that the locking level that each transaction is holding can be stored alongside its Xid. Also, multixacts now need to persist across server restarts and crashes, because they can now represent not only tuple locks, but also tuple updates. This means we need more careful tracking of lifetime of pg_multixact SLRU files; since they now persist longer, we require more infrastructure to figure out when they can be removed. pg_upgrade also needs to be careful to copy pg_multixact files over from the old server to the new, or at least part of multixact.c state, depending on the versions of the old and new servers. Tuple time qualification rules (HeapTupleSatisfies routines) need to be careful not to consider tuples with the "is multi" infomask bit set as being only locked; they might need to look up MultiXact values (i.e. possibly do pg_multixact I/O) to find out the Xid that updated a tuple, whereas they previously were assured to only use information readily available from the tuple header. This is considered acceptable, because the extra I/O would involve cases that would previously cause some commands to block waiting for concurrent transactions to finish. Another important change is the fact that locking tuples that have previously been updated causes the future versions to be marked as locked, too; this is essential for correctness of foreign key checks. This causes additional WAL-logging, also (there was previously a single WAL record for a locked tuple; now there are as many as updated copies of the tuple there exist.) With all this in place, contention related to tuples being checked by foreign key rules should be much reduced. As a bonus, the old behavior that a subtransaction grabbing a stronger tuple lock than the parent (sub)transaction held on a given tuple and later aborting caused the weaker lock to be lost, has been fixed. Many new spec files were added for isolation tester framework, to ensure overall behavior is sane. There's probably room for several more tests. There were several reviewers of this patch; in particular, Noah Misch and Andres Freund spent considerable time in it. Original idea for the patch came from Simon Riggs, after a problem report by Joel Jacobson. Most code is from me, with contributions from Marti Raudsepp, Alexander Shulgin, Noah Misch and Andres Freund. This patch was discussed in several pgsql-hackers threads; the most important start at the following message-ids: AANLkTimo9XVcEzfiBR-ut3KVNDkjm2Vxh+t8kAmWjPuv@mail.gmail.com 1290721684-sup-3951@alvh.no-ip.org 1294953201-sup-2099@alvh.no-ip.org 1320343602-sup-2290@alvh.no-ip.org 1339690386-sup-8927@alvh.no-ip.org 4FE5FF020200002500048A3D@gw.wicourts.gov 4FEAB90A0200002500048B7D@gw.wicourts.gov
2013-01-23 16:04:59 +01:00
priorXmax = HeapTupleHeaderGetUpdateXid(htup);
}
}
}