From a760893dbda9934e287789d54bbd3c4ca3914ce0 Mon Sep 17 00:00:00 2001 From: Simon Riggs Date: Sun, 28 Mar 2010 09:27:02 +0000 Subject: [PATCH] Derive latestRemovedXid for btree deletes by reading heap pages. The WAL record for btree delete contains a list of tids, even when backup blocks are present. We follow the tids to their heap tuples, taking care to follow LP_REDIRECT tuples. We ignore LP_DEAD tuples on the understanding that they will always have xmin/xmax earlier than any LP_NORMAL tuples referred to by killed index tuples. Iff all tuples are LP_DEAD we return InvalidTransactionId. The heap relfilenode is added to the WAL record, requiring API changes to pass down the heap Relation. XLOG_PAGE_MAGIC updated. --- src/backend/access/nbtree/nbtinsert.c | 18 ++-- src/backend/access/nbtree/nbtpage.c | 122 ++++++++++++++------- src/backend/access/nbtree/nbtree.c | 6 +- src/backend/access/nbtree/nbtxlog.c | 150 ++++++++++++++++++++++++-- src/include/access/nbtree.h | 25 ++--- src/include/access/xlog_internal.h | 4 +- 6 files changed, 254 insertions(+), 71 deletions(-) diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index de9bd95f88..cd70a4c73e 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.177 2010/02/26 02:00:34 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.178 2010/03/28 09:27:01 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -57,7 +57,8 @@ static void _bt_findinsertloc(Relation rel, OffsetNumber *offsetptr, int keysz, ScanKey scankey, - IndexTuple newtup); + IndexTuple newtup, + Relation heapRel); static void _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, IndexTuple itup, @@ -78,7 +79,7 @@ static void _bt_pgaddtup(Relation rel, Page page, OffsetNumber itup_off, const char *where); 
static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, int keysz, ScanKey scankey); -static void _bt_vacuum_one_page(Relation rel, Buffer buffer); +static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel); /* @@ -175,7 +176,7 @@ top: if (checkUnique != UNIQUE_CHECK_EXISTING) { /* do the insertion */ - _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup); + _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel); _bt_insertonpg(rel, buf, stack, itup, offset, false); } else @@ -491,7 +492,8 @@ _bt_findinsertloc(Relation rel, OffsetNumber *offsetptr, int keysz, ScanKey scankey, - IndexTuple newtup) + IndexTuple newtup, + Relation heapRel) { Buffer buf = *bufptr; Page page = BufferGetPage(buf); @@ -556,7 +558,7 @@ _bt_findinsertloc(Relation rel, */ if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop)) { - _bt_vacuum_one_page(rel, buf); + _bt_vacuum_one_page(rel, buf, heapRel); /* * remember that we vacuumed this page, because that makes the @@ -1998,7 +2000,7 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum, * super-exclusive "cleanup" lock (see nbtree/README). 
*/ static void -_bt_vacuum_one_page(Relation rel, Buffer buffer) +_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel) { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; @@ -2025,7 +2027,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer) } if (ndeletable > 0) - _bt_delitems(rel, buffer, deletable, ndeletable, false, 0); + _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel); /* * Note: if we didn't find any LP_DEAD items, then the page's diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 4aa7599351..cb94c76bcc 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.121 2010/03/19 10:41:21 sriggs Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.122 2010/03/28 09:27:01 sriggs Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -719,15 +719,12 @@ _bt_page_recyclable(Page page) * ensure correct locking. 
*/ void -_bt_delitems(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, bool isVacuum, - BlockNumber lastBlockVacuumed) +_bt_delitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; - Assert(isVacuum || lastBlockVacuumed == 0); - /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); @@ -759,35 +756,14 @@ _bt_delitems(Relation rel, Buffer buf, XLogRecPtr recptr; XLogRecData rdata[2]; - if (isVacuum) - { - xl_btree_vacuum xlrec_vacuum; + xl_btree_vacuum xlrec_vacuum; - xlrec_vacuum.node = rel->rd_node; - xlrec_vacuum.block = BufferGetBlockNumber(buf); - - xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; - rdata[0].data = (char *) &xlrec_vacuum; - rdata[0].len = SizeOfBtreeVacuum; - } - else - { - xl_btree_delete xlrec_delete; - - xlrec_delete.node = rel->rd_node; - xlrec_delete.block = BufferGetBlockNumber(buf); - - /* - * XXX: We would like to set an accurate latestRemovedXid, but - * there is no easy way of obtaining a useful value. So we punt - * and store InvalidTransactionId, which forces the standby to - * wait for/cancel all currently running transactions. 
- */ - xlrec_delete.latestRemovedXid = InvalidTransactionId; - rdata[0].data = (char *) &xlrec_delete; - rdata[0].len = SizeOfBtreeDelete; - } + xlrec_vacuum.node = rel->rd_node; + xlrec_vacuum.block = BufferGetBlockNumber(buf); + xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; + rdata[0].data = (char *) &xlrec_vacuum; + rdata[0].len = SizeOfBtreeVacuum; rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); @@ -810,10 +786,82 @@ _bt_delitems(Relation rel, Buffer buf, rdata[1].buffer_std = true; rdata[1].next = NULL; - if (isVacuum) - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata); - else - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata); + + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + + END_CRIT_SECTION(); +} + +void +_bt_delitems_delete(Relation rel, Buffer buf, + OffsetNumber *itemnos, int nitems, Relation heapRel) +{ + Page page = BufferGetPage(buf); + BTPageOpaque opaque; + + Assert(nitems > 0); + + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* Fix the page */ + PageIndexMultiDelete(page, itemnos, nitems); + + /* + * We can clear the vacuum cycle ID since this page has certainly been + * processed by the current vacuum scan. + */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + opaque->btpo_cycleid = 0; + + /* + * Mark the page as not containing any LP_DEAD items. This is not + * certainly true (there might be some that have recently been marked, but + * weren't included in our target-item list), but it will almost always be + * true and it doesn't seem worth an additional page scan to check it. + * Remember that BTP_HAS_GARBAGE is only a hint anyway. 
+ */
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (!rel->rd_istemp)
+ {
+ XLogRecPtr recptr;
+ XLogRecData rdata[3];
+
+ xl_btree_delete xlrec_delete;
+
+ xlrec_delete.node = rel->rd_node;
+ xlrec_delete.hnode = heapRel->rd_node;
+ xlrec_delete.block = BufferGetBlockNumber(buf);
+ xlrec_delete.nitems = nitems;
+
+ rdata[0].data = (char *) &xlrec_delete;
+ rdata[0].len = SizeOfBtreeDelete;
+ rdata[0].buffer = InvalidBuffer;
+ rdata[0].next = &(rdata[1]);
+
+ /*
+ * We need the target-offsets array whether or not we store the whole
+ * buffer, to allow us to find the latestRemovedXid on a standby server.
+ */
+ rdata[1].data = (char *) itemnos;
+ rdata[1].len = nitems * sizeof(OffsetNumber);
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].next = &(rdata[2]);
+
+ rdata[2].data = NULL;
+ rdata[2].len = 0;
+ rdata[2].buffer = buf;
+ rdata[2].buffer_std = true;
+ rdata[2].next = NULL;
+
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
 PageSetLSN(page, recptr);
 PageSetTLI(page, ThisTimeLineID);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 01899cfc16..0fcde95ccd 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.176 2010/02/26 02:00:34 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.177 2010/03/28 09:27:01 sriggs Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -708,7 +708,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
 info->strategy);
 LockBufferForCleanup(buf);
- _bt_delitems(rel, buf, NULL, 0, true, vstate.lastBlockVacuumed);
+ _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
 _bt_relbuf(rel, buf);
 }
@@ -889,7 +889,7 @@ restart: { BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); - _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed); + _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed); /* * Keep track of the block number of the lastBlockVacuumed, so we diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 782778cc7c..5bc710caec 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.63 2010/03/19 10:41:22 sriggs Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.64 2010/03/28 09:27:01 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -553,6 +553,139 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record) UnlockReleaseBuffer(buffer); } +/* + * Get the latestRemovedXid from the heap pages pointed at by the index + * tuples being deleted. This puts the work for calculating latestRemovedXid + * into the recovery path rather than the primary path. + * + * It's possible that this generates a fair amount of I/O, since an index + * block may have hundreds of tuples being deleted. Repeat accesses to the + * same heap blocks are common, though are not yet optimised. 
+ *
+ * XXX optimise later with something like XLogPrefetchBuffer()
+ */
+static TransactionId
+btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
+{
+ OffsetNumber *unused;
+ Buffer ibuffer, hbuffer;
+ Page ipage, hpage;
+ ItemId iitemid, hitemid;
+ IndexTuple itup;
+ HeapTupleHeader htuphdr;
+ BlockNumber hblkno;
+ OffsetNumber hoffnum;
+ TransactionId latestRemovedXid = InvalidTransactionId;
+ TransactionId htupxid = InvalidTransactionId;
+ int i;
+ int num_unused = 0, num_redirect = 0, num_dead = 0;
+
+ xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+ /*
+ * Get index page
+ */
+ ibuffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+ if (!BufferIsValid(ibuffer))
+ return InvalidTransactionId;
+ ipage = (Page) BufferGetPage(ibuffer);
+
+ /*
+ * Loop through the deleted index items to obtain the TransactionId
+ * from the heap items they point to.
+ */
+ unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
+
+ for (i = 0; i < xlrec->nitems; i++)
+ {
+ /*
+ * Identify the index tuple about to be deleted
+ */
+ iitemid = PageGetItemId(ipage, unused[i]);
+ itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+ /*
+ * Locate the heap page that the index tuple points at
+ */
+ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+ hbuffer = XLogReadBuffer(xlrec->hnode, hblkno, false);
+ if (!BufferIsValid(hbuffer))
+ {
+ UnlockReleaseBuffer(ibuffer);
+ return InvalidTransactionId;
+ }
+ hpage = (Page) BufferGetPage(hbuffer);
+
+ /*
+ * Look up the heap tuple header that the index tuple points at
+ * by using the heap node supplied with the xlrec. We can't use
+ * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+ * Note that we are not looking at tuple data here, just headers.
+ */
+ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+ hitemid = PageGetItemId(hpage, hoffnum);
+
+ /*
+ * Follow any redirections until we find something useful.
+ */ + while (ItemIdIsRedirected(hitemid)) + { + num_redirect++; + hoffnum = ItemIdGetRedirect(hitemid); + hitemid = PageGetItemId(hpage, hoffnum); + CHECK_FOR_INTERRUPTS(); + } + + /* + * If the heap item has storage, then read the header. Some LP_DEAD + * items may not be accessible, so we ignore them. + */ + if (ItemIdHasStorage(hitemid)) + { + htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); + + /* + * Get the heap tuple's xmin/xmax and ratchet up the latestRemovedXid. + * No need to consider xvac values here. + */ + htupxid = HeapTupleHeaderGetXmin(htuphdr); + if (TransactionIdFollows(htupxid, latestRemovedXid)) + latestRemovedXid = htupxid; + + htupxid = HeapTupleHeaderGetXmax(htuphdr); + if (TransactionIdFollows(htupxid, latestRemovedXid)) + latestRemovedXid = htupxid; + } + else if (ItemIdIsDead(hitemid)) + { + /* + * Conjecture: if hitemid is dead then it had xids before the xids + * marked on LP_NORMAL items. So we just ignore this item and move + * onto the next, for the purposes of calculating latestRemovedxids. + */ + num_dead++; + } + else + { + Assert(!ItemIdIsUsed(hitemid)); + num_unused++; + } + + UnlockReleaseBuffer(hbuffer); + } + + UnlockReleaseBuffer(ibuffer); + + Assert(num_unused == 0); + + /* + * Note that if all heap tuples were LP_DEAD then we will be + * returning InvalidTransactionId here. This seems very unlikely + * in practice. 
+ */ + return latestRemovedXid; +} + static void btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) { @@ -584,12 +717,10 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record) if (record->xl_len > SizeOfBtreeDelete) { OffsetNumber *unused; - OffsetNumber *unend; unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete); - unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); - PageIndexMultiDelete(page, unused, unend - unused); + PageIndexMultiDelete(page, unused, xlrec->nitems); } /* @@ -830,6 +961,7 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) * from individual btree vacuum records on that index. */ { + TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record); xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); /* @@ -839,7 +971,7 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record) * here is worth some thought and possibly some effort to * improve. */ - ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node); } break; @@ -1012,10 +1144,10 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec) { xl_btree_delete *xlrec = (xl_btree_delete *) rec; - appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u", - xlrec->node.spcNode, xlrec->node.dbNode, - xlrec->node.relNode, xlrec->block, - xlrec->latestRemovedXid); + appendStringInfo(buf, "delete: index %u/%u/%u; iblk %u, heap %u/%u/%u;", + xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, + xlrec->block, + xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode); break; } case XLOG_BTREE_DELETE_PAGE: diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index e00594b487..775c47da55 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * 
$PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.133 2010/03/20 07:49:48 sriggs Exp $ + * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.134 2010/03/28 09:27:02 sriggs Exp $ * *------------------------------------------------------------------------- */ @@ -314,14 +314,15 @@ typedef struct xl_btree_split */ typedef struct xl_btree_delete { - RelFileNode node; + RelFileNode node; /* RelFileNode of the index */ BlockNumber block; - TransactionId latestRemovedXid; + RelFileNode hnode; /* RelFileNode of the heap the index currently points at */ + int nitems; /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ } xl_btree_delete; -#define SizeOfBtreeDelete (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId)) +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int)) /* * This is what we need to know about page reuse within btree. @@ -349,13 +350,12 @@ typedef struct xl_btree_reuse_page * heap tuples. * * Any changes to any one block are registered on just one WAL record. All - * blocks that we need to run EnsureBlockUnpinned() before we touch the changed - * block are also given on this record as a variable length array. The array - * is compressed by way of storing an array of block ranges, rather than an - * actual array of blockids. + * blocks that we need to run EnsureBlockUnpinned() are listed as a block range + * starting from the last block vacuumed through until this one. Individual + * block numbers aren't given. * * Note that the *last* WAL record in any vacuum of an index is allowed to - * have numItems == 0. All other WAL records must have numItems > 0. + * have a zero length array of offsets. Earlier records must have at least one. 
*/ typedef struct xl_btree_vacuum { @@ -588,9 +588,10 @@ extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, extern void _bt_relbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); -extern void _bt_delitems(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, bool isVacuum, - BlockNumber lastBlockVacuumed); +extern void _bt_delitems_delete(Relation rel, Buffer buf, + OffsetNumber *itemnos, int nitems, Relation heapRel); +extern void _bt_delitems_vacuum(Relation rel, Buffer buf, + OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed); extern int _bt_pagedel(Relation rel, Buffer buf, BTStack stack); /* diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 0787eb582c..c93e3848e8 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -11,7 +11,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlog_internal.h,v 1.30 2010/03/19 17:42:10 sriggs Exp $ + * $PostgreSQL: pgsql/src/include/access/xlog_internal.h,v 1.31 2010/03/28 09:27:02 sriggs Exp $ */ #ifndef XLOG_INTERNAL_H #define XLOG_INTERNAL_H @@ -71,7 +71,7 @@ typedef struct XLogContRecord /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0x9002 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0x9003 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData {