Fix index deletion latestRemovedXid bug.

The logic for determining the latest removed XID for the purposes of
generating recovery conflicts in REDO routines was subtly broken.  It
failed to follow links from HOT chains, and so failed to consider all
relevant heap tuple headers in some cases.

To fix, expand the loop that deals with LP_REDIRECT line pointers to
also deal with HOT chains.  The new version of the loop is loosely based
on a similar loop from heap_prune_chain().
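The HOT-chain walk described above can be modeled in miniature. The sketch below is not PostgreSQL source: `ChainMember`, `chain_latest_removed_xid`, and the use of a bare update XID in place of `HeapTupleHeaderAdvanceLatestRemovedXid()` are illustrative stand-ins. It shows the two properties the new loop relies on: each chain member's xmin must match the prior member's update XID (otherwise the chain is broken and traversal stops), and the horizon is advanced from every tuple header actually visited.

```c
#include <stdbool.h>

#define INVALID_XID 0

/* Hypothetical, pared-down stand-in for a heap tuple on one page */
typedef struct ChainMember
{
	unsigned	xmin;		/* inserting XID */
	unsigned	update_xid;	/* updating XID, if HOT-updated (else 0) */
	bool		hot_updated;	/* does t_ctid point at a successor? */
	int			next;		/* offset of successor on the same page */
} ChainMember;

/*
 * Walk a HOT chain from its root, advancing the latestRemovedXid
 * horizon from each visited member, in the style of the fixed loop.
 */
static unsigned
chain_latest_removed_xid(ChainMember *page, int root)
{
	unsigned	latest = INVALID_XID;
	unsigned	prior_xmax = INVALID_XID;	/* cannot check first xmin */
	int			offnum = root;

	for (;;)
	{
		ChainMember *tup = &page[offnum];

		/* Check this member's xmin against prior member's xmax, if any */
		if (prior_xmax != INVALID_XID && tup->xmin != prior_xmax)
			break;

		/* Advance horizon using this member's header */
		if (tup->update_xid > latest)
			latest = tup->update_xid;

		/* End of HOT chain? */
		if (!tup->hot_updated)
			break;

		/* Advance to next HOT chain member */
		offnum = tup->next;
		prior_xmax = tup->update_xid;
	}
	return latest;
}
```

Given a three-member chain updated by XIDs 11 and 12, the walk returns 12; corrupting the middle member's xmin makes the chain-validity check stop the walk after the first member, so only 11 is considered.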

The impact of this bug is probably quite limited, since the horizon code
necessarily deals with heap tuples that are pointed to by LP_DEAD-set
index tuples.  The process of setting LP_DEAD index tuples (e.g. within
the kill_prior_tuple mechanism) is highly correlated with opportunistic
pruning of pointed-to heap tuples.  Plus the question of generating a
recovery conflict usually comes up some time after index tuple LP_DEAD
bits were initially set, unlike heap pruning, where a latestRemovedXid
is generated at the point of the pruning operation (heap pruning has no
deferred "would-be page split" style processing that produces conflicts
lazily).

Only backpatch to Postgres 12, the first version where this logic runs
during original execution (following commit 558a9165e0).  The index
latestRemovedXid mechanism has had the same bug since it first appeared
over 10 years ago (in commit a760893d), but backpatching to all
supported versions now seems like a bad idea on balance.  Running the
new improved code during recovery seems risky, especially given the lack
of complaints from the field.

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com
Backpatch: 12-
Peter Geoghegan 2020-12-30 16:29:05 -08:00
parent 319f4d54e8
commit 4228817449
2 changed files with 79 additions and 55 deletions

src/backend/access/heap/heapam.c

@@ -6997,10 +6997,13 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 									  ItemPointerData *tids,
 									  int nitems)
 {
+	/* Initial assumption is that earlier pruning took care of conflict */
 	TransactionId latestRemovedXid = InvalidTransactionId;
-	BlockNumber hblkno;
+	BlockNumber blkno = InvalidBlockNumber;
 	Buffer		buf = InvalidBuffer;
-	Page		hpage;
+	Page		page = NULL;
+	OffsetNumber maxoff = InvalidOffsetNumber;
+	TransactionId priorXmax;
 #ifdef USE_PREFETCH
 	XidHorizonPrefetchState prefetch_state;
 	int			prefetch_distance;
@@ -7040,20 +7043,17 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 #endif
 
 	/* Iterate over all tids, and check their horizon */
-	hblkno = InvalidBlockNumber;
-	hpage = NULL;
 	for (int i = 0; i < nitems; i++)
 	{
 		ItemPointer htid = &tids[i];
-		ItemId		hitemid;
-		OffsetNumber hoffnum;
+		OffsetNumber offnum;
 
 		/*
 		 * Read heap buffer, but avoid refetching if it's the same block as
 		 * required for the last tid.
 		 */
-		if (hblkno == InvalidBlockNumber ||
-			ItemPointerGetBlockNumber(htid) != hblkno)
+		if (blkno == InvalidBlockNumber ||
+			ItemPointerGetBlockNumber(htid) != blkno)
 		{
 			/* release old buffer */
 			if (BufferIsValid(buf))
@@ -7062,9 +7062,9 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 				ReleaseBuffer(buf);
 			}
 
-			hblkno = ItemPointerGetBlockNumber(htid);
+			blkno = ItemPointerGetBlockNumber(htid);
 
-			buf = ReadBuffer(rel, hblkno);
+			buf = ReadBuffer(rel, blkno);
 
 #ifdef USE_PREFETCH
 
@@ -7075,49 +7075,79 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 			xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
 #endif
 
-			hpage = BufferGetPage(buf);
+			page = BufferGetPage(buf);
+			maxoff = PageGetMaxOffsetNumber(page);
 			LockBuffer(buf, BUFFER_LOCK_SHARE);
 		}
 
-		hoffnum = ItemPointerGetOffsetNumber(htid);
-		hitemid = PageGetItemId(hpage, hoffnum);
-
 		/*
-		 * Follow any redirections until we find something useful.
+		 * Maintain latestRemovedXid value for deletion operation as a whole
+		 * by advancing current value using heap tuple headers.  This is
+		 * loosely based on the logic for pruning a HOT chain.
 		 */
-		while (ItemIdIsRedirected(hitemid))
+		offnum = ItemPointerGetOffsetNumber(htid);
+		priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
+		for (;;)
 		{
-			hoffnum = ItemIdGetRedirect(hitemid);
-			hitemid = PageGetItemId(hpage, hoffnum);
-		}
+			ItemId		lp;
+			HeapTupleHeader htup;
 
-		/*
-		 * If the heap item has storage, then read the header and use that to
-		 * set latestRemovedXid.
-		 *
-		 * Some LP_DEAD items may not be accessible, so we ignore them.
-		 */
-		if (ItemIdHasStorage(hitemid))
-		{
-			HeapTupleHeader htuphdr;
+			/* Some sanity checks */
+			if (offnum < FirstOffsetNumber || offnum > maxoff)
+			{
+				Assert(false);
+				break;
+			}
 
-			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+			lp = PageGetItemId(page, offnum);
+			if (ItemIdIsRedirected(lp))
+			{
+				offnum = ItemIdGetRedirect(lp);
+				continue;
+			}
 
-			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
-		}
-		else if (ItemIdIsDead(hitemid))
-		{
 			/*
-			 * Conjecture: if hitemid is dead then it had xids before the xids
-			 * marked on LP_NORMAL items. So we just ignore this item and move
-			 * onto the next, for the purposes of calculating
-			 * latestRemovedXid.
+			 * We'll often encounter LP_DEAD line pointers.  No need to do
+			 * anything more with htid when that happens.  This is okay
+			 * because the earlier pruning operation that made the line
+			 * pointer LP_DEAD in the first place must have considered the
+			 * tuple header as part of generating its own latestRemovedXid
+			 * value.
+			 *
+			 * Relying on XLOG_HEAP2_CLEANUP_INFO records like this is the
+			 * same strategy that index vacuuming uses in all cases.  Index
+			 * VACUUM WAL records don't even have a latestRemovedXid field of
+			 * their own for this reason.
 			 */
-		}
-		else
-			Assert(!ItemIdIsUsed(hitemid));
+			if (!ItemIdIsNormal(lp))
+				break;
 
+			htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+			/*
+			 * Check the tuple XMIN against prior XMAX, if any
+			 */
+			if (TransactionIdIsValid(priorXmax) &&
+				!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
+				break;
+
+			HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
+
+			/*
+			 * If the tuple is not HOT-updated, then we are at the end of this
+			 * HOT-chain.  No need to visit later tuples from the same update
+			 * chain (they get their own index entries) -- just move on to
+			 * next htid from index AM caller.
+			 */
+			if (!HeapTupleHeaderIsHotUpdated(htup))
+				break;
+
+			/* Advance to next HOT chain member */
+			Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
+			offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
+		}
 	}
 
 	if (BufferIsValid(buf))
@@ -7126,14 +7156,6 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 		ReleaseBuffer(buf);
 	}
 
-	/*
-	 * If all heap tuples were LP_DEAD then we will be returning
-	 * InvalidTransactionId here, which avoids conflicts. This matches
-	 * existing logic which assumes that LP_DEAD tuples must already be older
-	 * than the latestRemovedXid on the cleanup record that set them as
-	 * LP_DEAD, hence must already have generated a conflict.
-	 */
-
 	return latestRemovedXid;
 }
 

src/backend/storage/ipc/standby.c

@@ -305,13 +305,15 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode
 	VirtualTransactionId *backends;
 
 	/*
-	 * If we get passed InvalidTransactionId then we are a little surprised,
-	 * but it is theoretically possible in normal running. It also happens
-	 * when replaying already applied WAL records after a standby crash or
-	 * restart, or when replaying an XLOG_HEAP2_VISIBLE record that marks as
-	 * frozen a page which was already all-visible. If latestRemovedXid is
-	 * invalid then there is no conflict. That rule applies across all record
-	 * types that suffer from this conflict.
+	 * If we get passed InvalidTransactionId then we do nothing (no conflict).
+	 *
+	 * This can happen when replaying already-applied WAL records after a
+	 * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
+	 * record that marks as frozen a page which was already all-visible.  It's
+	 * also quite common with records generated during index deletion
+	 * (original execution of the deletion can reason that a recovery conflict
+	 * which is sufficient for the deletion operation must take place before
+	 * replay of the deletion record itself).
 	 */
 	if (!TransactionIdIsValid(latestRemovedXid))
 		return;