From d2e5e20e57111cca9e14f6e5a99a186d4c66a5b7 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 3 Jan 2020 12:18:13 -0800 Subject: [PATCH] Add xl_btree_delete optimization. Commit 558a9165e08 taught _bt_delitems_delete() to produce its own XID horizon on the primary. Standbys no longer needed to generate their own latestRemovedXid, since they could just use the explicitly logged value from the primary instead. The deleted offset numbers array from the xl_btree_delete WAL record was no longer used by the REDO routine for anything other than deleting the items. This enables a minor optimization: We now treat the array as buffer state, not generic WAL data, following _bt_delitems_vacuum()'s example. This should be a minor win, since it allows us to avoid including the deleted items array in cases where XLogInsert() stores the whole buffer anyway. The primary goal here is to make the code more maintainable, though. Removing inessential differences between the two functions highlights the fundamental differences that remain. Also change xl_btree_delete to use uint32 for the size of the array of item offsets being deleted. This brings xl_btree_delete closer to xl_btree_vacuum. Furthermore, it seems like a good idea to use an explicit-width integer type (the field was previously an "int"). Bump XLOG_PAGE_MAGIC because xl_btree_delete changed. Discussion: https://postgr.es/m/CAH2-Wzkz4TjmezzfAbaV1zYrh=fr0bCpzuJTvBe5iUQ3aUPsCQ@mail.gmail.com --- src/backend/access/nbtree/nbtpage.c | 48 ++++++++++++--------------- src/backend/access/nbtree/nbtxlog.c | 11 ++---- src/backend/access/rmgrdesc/nbtdesc.c | 4 +-- src/include/access/nbtree.h | 3 +- src/include/access/nbtxlog.h | 6 ++-- src/include/access/xlog_internal.h | 2 +- 6 files changed, 32 insertions(+), 42 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 73d28d37a3..f05cbe7467 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -961,20 +961,15 @@ _bt_page_recyclable(Page page) } /* - * Delete item(s) from a btree page during VACUUM. - * - * This must only be used for deleting leaf items. Deleting an item on a - * non-leaf page has to be done as part of an atomic action that includes - * deleting the page it points to. + * Delete item(s) from a btree leaf page during VACUUM. * * This routine assumes that the caller has a super-exclusive write lock on * the buffer. Also, the given deletable array *must* be sorted in ascending * order. * * We record VACUUMs and b-tree deletes differently in WAL. Deletes must - * generate recovery conflicts by accessing the heap inline, whereas VACUUMs - * can rely on the initial heap scan taking care of the problem (pruning would - * have generated the conflicts needed for hot standby already). + * generate their own latestRemovedXid by accessing the heap directly, whereas + * VACUUMs rely on the initial heap scan taking care of it indirectly. */ void _bt_delitems_vacuum(Relation rel, Buffer buf, @@ -1030,9 +1025,9 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, XLogRegisterData((char *) &xlrec_vacuum, SizeOfBtreeVacuum); /* - * The target-offsets array is not in the buffer, but pretend that it - * is. When XLogInsert stores the whole buffer, the offsets array - * need not be stored too. + * The deletable array is not in the buffer, but pretend that it is. + * When XLogInsert stores the whole buffer, the array need not be + * stored too. */ XLogRegisterBufData(0, (char *) deletable, ndeletable * sizeof(OffsetNumber)); @@ -1046,21 +1041,19 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, } /* - * Delete item(s) from a btree page during single-page cleanup. - * - * As above, must only be used on leaf pages. + * Delete item(s) from a btree leaf page during single-page cleanup. * * This routine assumes that the caller has pinned and write locked the - * buffer. Also, the given itemnos *must* appear in increasing order in the - * array. + * buffer. Also, the given deletable array *must* be sorted in ascending + * order. * * This is nearly the same as _bt_delitems_vacuum as far as what it does to - * the page, but it needs to generate its own recovery conflicts by accessing - * the heap. See comments for _bt_delitems_vacuum. + * the page, but it needs to generate its own latestRemovedXid by accessing + * the heap. This is used by the REDO routine to generate recovery conflicts. */ void _bt_delitems_delete(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, + OffsetNumber *deletable, int ndeletable, Relation heapRel) { Page page = BufferGetPage(buf); @@ -1068,18 +1061,18 @@ _bt_delitems_delete(Relation rel, Buffer buf, TransactionId latestRemovedXid = InvalidTransactionId; /* Shouldn't be called unless there's something to do */ - Assert(nitems > 0); + Assert(ndeletable > 0); if (XLogStandbyInfoActive() && RelationNeedsWAL(rel)) latestRemovedXid = index_compute_xid_horizon_for_tuples(rel, heapRel, buf, - itemnos, nitems); + deletable, ndeletable); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* Fix the page */ - PageIndexMultiDelete(page, itemnos, nitems); + PageIndexMultiDelete(page, deletable, ndeletable); /* * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID, @@ -1098,18 +1091,19 @@ _bt_delitems_delete(Relation rel, Buffer buf, xl_btree_delete xlrec_delete; xlrec_delete.latestRemovedXid = latestRemovedXid; - xlrec_delete.nitems = nitems; + xlrec_delete.ndeleted = ndeletable; XLogBeginInsert(); XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterData((char *) &xlrec_delete, SizeOfBtreeDelete); /* - * We need the target-offsets array whether or not we store the whole - * buffer, to allow us to find the latestRemovedXid on a standby - * server. + * The deletable array is not in the buffer, but pretend that it is. + * When XLogInsert stores the whole buffer, the array need not be + * stored too. */ - XLogRegisterData((char *) itemnos, nitems * sizeof(OffsetNumber)); + XLogRegisterBufData(0, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index e1c3749148..2e5202c2d6 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -449,16 +449,11 @@ btree_xlog_delete(XLogReaderState *record) */ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { + char *ptr = XLogRecGetBlockData(record, 0, NULL); + page = (Page) BufferGetPage(buffer); - if (XLogRecGetDataLen(record) > SizeOfBtreeDelete) - { - OffsetNumber *unused; - - unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete); - - PageIndexMultiDelete(page, unused, xlrec->nitems); - } + PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted); /* Mark the page as not containing any LP_DEAD items */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index e0ec8a4b0b..7d63a7124e 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -53,8 +53,8 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_delete *xlrec = (xl_btree_delete *) rec; - appendStringInfo(buf, "%d items, latest removed xid %u", - xlrec->nitems, xlrec->latestRemovedXid); + appendStringInfo(buf, "latestRemovedXid %u; ndeleted %u", + xlrec->latestRemovedXid, xlrec->ndeleted); break; } case XLOG_BTREE_MARK_PAGE_HALFDEAD: diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4f84ca83dc..f90ee3a0e0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -779,7 +779,8 @@ extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable); extern void _bt_delitems_delete(Relation rel, Buffer buf, - OffsetNumber *itemnos, int nitems, Relation heapRel); + OffsetNumber *deletable, int ndeletable, + Relation heapRel); extern int _bt_pagedel(Relation rel, Buffer buf); /* diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 3da5514655..776a9bd723 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -126,12 +126,12 @@ typedef struct xl_btree_split typedef struct xl_btree_delete { TransactionId latestRemovedXid; - int nitems; + uint32 ndeleted; - /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ + /* DELETED TARGET OFFSET NUMBERS FOLLOW */ } xl_btree_delete; -#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int)) +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, ndeleted) + sizeof(uint32)) /* * This is what we need to know about page reuse within btree. This record diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 0a836d1c92..087918d41d 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD103 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD104 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData {