From 422881744997417944634a7f84af7a66a608de9a Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 30 Dec 2020 16:29:05 -0800
Subject: [PATCH] Fix index deletion latestRemovedXid bug.

The logic for determining the latest removed XID for the purposes of
generating recovery conflicts in REDO routines was subtly broken.  It
failed to follow links from HOT chains, and so failed to consider all
relevant heap tuple headers in some cases.

To fix, expand the loop that deals with LP_REDIRECT line pointers to
also deal with HOT chains.  The new version of the loop is loosely based
on a similar loop from heap_prune_chain().

The impact of this bug is probably quite limited, since the horizon code
necessarily deals with heap tuples that are pointed to by LP_DEAD-set
index tuples.  The process of setting LP_DEAD index tuples (e.g. within
the kill_prior_tuple mechanism) is highly correlated with opportunistic
pruning of pointed-to heap tuples.  Plus the question of generating a
recovery conflict usually comes up some time after index tuple LP_DEAD
bits were initially set, unlike heap pruning, where a latestRemovedXid
is generated at the point of the pruning operation (heap pruning has no
deferred "would-be page split" style processing that produces conflicts
lazily).

Only backpatch to Postgres 12, the first version where this logic runs
during original execution (following commit 558a9165e08).  The index
latestRemovedXid mechanism has had the same bug since it first appeared
over 10 years ago (in commit a760893d), but backpatching to all
supported versions now seems like a bad idea on balance.  Running the
new improved code during recovery seems risky, especially given the lack
of complaints from the field.

Author: Peter Geoghegan <pg@bowt.ie>
Discussion: https://postgr.es/m/CAH2-Wz=Eib393+HHcERK_9MtgNS7Ew1HY=RDC_g6GL46zM5C6Q@mail.gmail.com
Backpatch: 12-
---
 src/backend/access/heap/heapam.c  | 118 ++++++++++++++++++------------
 src/backend/storage/ipc/standby.c |  16 ++--
 2 files changed, 79 insertions(+), 55 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index a9583f3103..da27903cb4 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6997,10 +6997,13 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 									ItemPointerData *tids,
 									int nitems)
 {
+	/* Initial assumption is that earlier pruning took care of conflict */
 	TransactionId latestRemovedXid = InvalidTransactionId;
-	BlockNumber hblkno;
+	BlockNumber blkno = InvalidBlockNumber;
 	Buffer		buf = InvalidBuffer;
-	Page		hpage;
+	Page		page = NULL;
+	OffsetNumber maxoff = InvalidOffsetNumber;
+	TransactionId priorXmax;
 #ifdef USE_PREFETCH
 	XidHorizonPrefetchState prefetch_state;
 	int			prefetch_distance;
@@ -7040,20 +7043,17 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 #endif
 
 	/* Iterate over all tids, and check their horizon */
-	hblkno = InvalidBlockNumber;
-	hpage = NULL;
 	for (int i = 0; i < nitems; i++)
 	{
 		ItemPointer htid = &tids[i];
-		ItemId		hitemid;
-		OffsetNumber hoffnum;
+		OffsetNumber offnum;
 
 		/*
 		 * Read heap buffer, but avoid refetching if it's the same block as
 		 * required for the last tid.
 		 */
-		if (hblkno == InvalidBlockNumber ||
-			ItemPointerGetBlockNumber(htid) != hblkno)
+		if (blkno == InvalidBlockNumber ||
+			ItemPointerGetBlockNumber(htid) != blkno)
 		{
 			/* release old buffer */
 			if (BufferIsValid(buf))
@@ -7062,9 +7062,9 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 				ReleaseBuffer(buf);
 			}
 
-			hblkno = ItemPointerGetBlockNumber(htid);
+			blkno = ItemPointerGetBlockNumber(htid);
 
-			buf = ReadBuffer(rel, hblkno);
+			buf = ReadBuffer(rel, blkno);
 
 #ifdef USE_PREFETCH
 
@@ -7075,49 +7075,79 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 			xid_horizon_prefetch_buffer(rel, &prefetch_state, 1);
 #endif
 
-			hpage = BufferGetPage(buf);
+			page = BufferGetPage(buf);
+			maxoff = PageGetMaxOffsetNumber(page);
 
 			LockBuffer(buf, BUFFER_LOCK_SHARE);
 		}
 
-		hoffnum = ItemPointerGetOffsetNumber(htid);
-		hitemid = PageGetItemId(hpage, hoffnum);
-
 		/*
-		 * Follow any redirections until we find something useful.
+		 * Maintain latestRemovedXid value for deletion operation as a whole
+		 * by advancing current value using heap tuple headers.  This is
+		 * loosely based on the logic for pruning a HOT chain.
 		 */
-		while (ItemIdIsRedirected(hitemid))
+		offnum = ItemPointerGetOffsetNumber(htid);
+		priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
+		for (;;)
 		{
-			hoffnum = ItemIdGetRedirect(hitemid);
-			hitemid = PageGetItemId(hpage, hoffnum);
-		}
+			ItemId		lp;
+			HeapTupleHeader htup;
 
-		/*
-		 * If the heap item has storage, then read the header and use that to
-		 * set latestRemovedXid.
-		 *
-		 * Some LP_DEAD items may not be accessible, so we ignore them.
-		 */
-		if (ItemIdHasStorage(hitemid))
-		{
-			HeapTupleHeader htuphdr;
+			/* Some sanity checks */
+			if (offnum < FirstOffsetNumber || offnum > maxoff)
+			{
+				Assert(false);
+				break;
+			}
 
-			htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+			lp = PageGetItemId(page, offnum);
+			if (ItemIdIsRedirected(lp))
+			{
+				offnum = ItemIdGetRedirect(lp);
+				continue;
+			}
 
-			HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
-		}
-		else if (ItemIdIsDead(hitemid))
-		{
 			/*
-			 * Conjecture: if hitemid is dead then it had xids before the xids
-			 * marked on LP_NORMAL items. So we just ignore this item and move
-			 * onto the next, for the purposes of calculating
-			 * latestRemovedXid.
+			 * We'll often encounter LP_DEAD line pointers.  No need to do
+			 * anything more with htid when that happens.  This is okay
+			 * because the earlier pruning operation that made the line
+			 * pointer LP_DEAD in the first place must have considered the
+			 * tuple header as part of generating its own latestRemovedXid
+			 * value.
+			 *
+			 * Relying on XLOG_HEAP2_CLEANUP_INFO records like this is the
+			 * same strategy that index vacuuming uses in all cases.  Index
+			 * VACUUM WAL records don't even have a latestRemovedXid field of
+			 * their own for this reason.
 			 */
-		}
-		else
-			Assert(!ItemIdIsUsed(hitemid));
+			if (!ItemIdIsNormal(lp))
+				break;
 
+			htup = (HeapTupleHeader) PageGetItem(page, lp);
+
+			/*
+			 * Check the tuple XMIN against prior XMAX, if any
+			 */
+			if (TransactionIdIsValid(priorXmax) &&
+				!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
+				break;
+
+			HeapTupleHeaderAdvanceLatestRemovedXid(htup, &latestRemovedXid);
+
+			/*
+			 * If the tuple is not HOT-updated, then we are at the end of this
+			 * HOT-chain.  No need to visit later tuples from the same update
+			 * chain (they get their own index entries) -- just move on to
+			 * next htid from index AM caller.
+			 */
+			if (!HeapTupleHeaderIsHotUpdated(htup))
+				break;
+
+			/* Advance to next HOT chain member */
+			Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
+			offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
+			priorXmax = HeapTupleHeaderGetUpdateXid(htup);
+		}
 	}
 
 	if (BufferIsValid(buf))
@@ -7126,14 +7156,6 @@ heap_compute_xid_horizon_for_tuples(Relation rel,
 		ReleaseBuffer(buf);
 	}
 
-	/*
-	 * If all heap tuples were LP_DEAD then we will be returning
-	 * InvalidTransactionId here, which avoids conflicts. This matches
-	 * existing logic which assumes that LP_DEAD tuples must already be older
-	 * than the latestRemovedXid on the cleanup record that set them as
-	 * LP_DEAD, hence must already have generated a conflict.
-	 */
-
 	return latestRemovedXid;
 }
 
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 92d9027776..da8e83a525 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -305,13 +305,15 @@ ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode
 	VirtualTransactionId *backends;
 
 	/*
-	 * If we get passed InvalidTransactionId then we are a little surprised,
-	 * but it is theoretically possible in normal running. It also happens
-	 * when replaying already applied WAL records after a standby crash or
-	 * restart, or when replaying an XLOG_HEAP2_VISIBLE record that marks as
-	 * frozen a page which was already all-visible.  If latestRemovedXid is
-	 * invalid then there is no conflict. That rule applies across all record
-	 * types that suffer from this conflict.
+	 * If we get passed InvalidTransactionId then we do nothing (no conflict).
+	 *
+	 * This can happen when replaying already-applied WAL records after a
+	 * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
+	 * record that marks as frozen a page which was already all-visible.  It's
+	 * also quite common with records generated during index deletion
+	 * (original execution of the deletion can reason that a recovery conflict
+	 * which is sufficient for the deletion operation must take place before
+	 * replay of the deletion record itself).
 	 */
 	if (!TransactionIdIsValid(latestRemovedXid))
 		return;