From 3bbd6af37cef7a46f6d3ecb8d95b58614bd76f52 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 23 Feb 2003 22:43:09 +0000 Subject: [PATCH] Adjust btbulkdelete logic so that only one WAL record is issued while deleting multiple index entries on a single index page. This makes for a very substantial reduction in the amount of WAL traffic during a large delete operation. --- src/backend/access/nbtree/nbtpage.c | 45 +++++--- src/backend/access/nbtree/nbtree.c | 155 +++++++++++++--------------- src/backend/access/nbtree/nbtxlog.c | 27 +++-- src/include/access/nbtree.h | 15 ++- 4 files changed, 135 insertions(+), 107 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 1643997202..a9bd393440 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.61 2003/02/23 06:17:13 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.62 2003/02/23 22:43:08 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -618,26 +618,34 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level) } /* - * Delete an item from a btree page. + * Delete item(s) from a btree page. * * This must only be used for deleting leaf items. Deleting an item on a * non-leaf page has to be done as part of an atomic action that includes * deleting the page it points to. * * This routine assumes that the caller has pinned and locked the buffer, - * and will write the buffer afterwards. + * and will write the buffer afterwards. Also, the given itemnos *must* + * appear in increasing order in the array. */ void -_bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) +_bt_delitems(Relation rel, Buffer buf, + OffsetNumber *itemnos, int nitems) { Page page = BufferGetPage(buf); - OffsetNumber offno; - - offno = ItemPointerGetOffsetNumber(tid); + int i; + /* No elog(ERROR) until changes are logged */ START_CRIT_SECTION(); - PageIndexTupleDelete(page, offno); + /* + * Delete the items in reverse order so we don't have to think about + * adjusting item numbers for previous deletions. + */ + for (i = nitems - 1; i >= 0; i--) + { + PageIndexTupleDelete(page, itemnos[i]); + } /* XLOG stuff */ if (!rel->rd_istemp) @@ -646,17 +654,30 @@ _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid) XLogRecPtr recptr; XLogRecData rdata[2]; - xlrec.target.node = rel->rd_node; - xlrec.target.tid = *tid; + xlrec.node = rel->rd_node; + xlrec.block = BufferGetBlockNumber(buf); rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDelete; rdata[0].next = &(rdata[1]); + /* + * The target-offsets array is not in the buffer, but pretend + * that it is. When XLogInsert stores the whole buffer, the offsets + * array need not be stored too. + */ rdata[1].buffer = buf; - rdata[1].data = NULL; - rdata[1].len = 0; + if (nitems > 0) + { + rdata[1].data = (char *) itemnos; + rdata[1].len = nitems * sizeof(OffsetNumber); + } + else + { + rdata[1].data = NULL; + rdata[1].len = 0; + } rdata[1].next = NULL; recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b7a1e7ada1..d73af9e678 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -12,7 +12,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS) IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); void *callback_state = (void *) PG_GETARG_POINTER(2); IndexBulkDeleteResult *result; - BlockNumber num_pages; double tuples_removed; double num_index_tuples; - IndexScanDesc scan; - BTScanOpaque so; - ItemPointer current; + OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)]; + int ndeletable; + Buffer buf; + BlockNumber num_pages; tuples_removed = 0; num_index_tuples = 0; /* - * We use a standard IndexScanDesc scan object, but to speed up the - * loop, we skip most of the wrapper layers of index_getnext and - * instead call _bt_step directly. This implies holding buffer lock - * on a target page throughout the loop over the page's tuples. + * The outer loop iterates over index leaf pages, the inner over items + * on a leaf page. We issue just one _bt_delitems() call per page, + * so as to minimize WAL traffic. * - * Whenever we step onto a new page, we have to trade in the read - * lock acquired by _bt_first or _bt_step for an exclusive write lock - * (fortunately, _bt_relbuf doesn't care which kind of lock it's - * releasing when it comes time for _bt_step to release our lock). - * - * Note that we exclusive-lock every leaf page, or at least every one - * containing data items. It sounds attractive to only exclusive-lock + * Note that we exclusive-lock every leaf page containing data items, + * in sequence left to right. It sounds attractive to only exclusive-lock * those containing items we need to delete, but unfortunately that * is not safe: we could then pass a stopped indexscan, which could * in rare cases lead to deleting the item it needs to find when it * resumes. (See _bt_restscan --- this could only happen if an indexscan * stops on a deletable item and then a page split moves that item * into a page further to its right, which the indexscan will have no - * pin on.) + * pin on.) We can skip obtaining exclusive lock on empty pages + * though, since no indexscan could be stopped on those. */ - scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL); - so = (BTScanOpaque) scan->opaque; - current = &(scan->currentItemData); - - /* Use _bt_first to get started, then _bt_step to remaining tuples */ - if (_bt_first(scan, ForwardScanDirection)) + buf = _bt_get_endpoint(rel, 0, false); + if (BufferIsValid(buf)) /* check for empty index */ { - Buffer buf; - BlockNumber lockedBlock = InvalidBlockNumber; - - /* we have the buffer pinned and read-locked */ - buf = so->btso_curbuf; - Assert(BufferIsValid(buf)); - - do + for (;;) { Page page; - BlockNumber blkno; - OffsetNumber offnum; - BTItem btitem; BTPageOpaque opaque; - IndexTuple itup; - ItemPointer htup; + OffsetNumber offnum, + minoff, + maxoff; + BlockNumber nextpage; CHECK_FOR_INTERRUPTS(); - /* current is the next index tuple */ + ndeletable = 0; page = BufferGetPage(buf); - blkno = ItemPointerGetBlockNumber(current); - - /* - * Make sure we have a super-exclusive write lock on this page. - * - * We assume that only concurrent insertions, not deletions, - * can occur while we're not holding the page lock (the - * caller should hold a suitable relation lock to ensure - * this). Therefore, no items can escape being scanned because - * of this temporary lock release. - */ - if (blkno != lockedBlock) + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + /* We probably cannot see deleted pages, but skip 'em if so */ + if (minoff <= maxoff && !P_ISDELETED(opaque)) { + /* + * Trade in the initial read lock for a super-exclusive + * write lock on this page. + */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); - lockedBlock = blkno; /* - * If the page was formerly rightmost but was split while we - * didn't hold the lock, and ip_posid is pointing to item - * 1, then ip_posid now points at the high key not a valid - * data item. In this case we need to step forward. + * Recompute minoff/maxoff, both of which could have changed + * while we weren't holding the lock. */ - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (current->ip_posid < P_FIRSTDATAKEY(opaque)) - current->ip_posid = P_FIRSTDATAKEY(opaque); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + /* + * Scan over all items to see which ones need deleted + * according to the callback function. + */ + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + BTItem btitem; + ItemPointer htup; + + btitem = (BTItem) PageGetItem(page, + PageGetItemId(page, offnum)); + htup = &(btitem->bti_itup.t_tid); + if (callback(htup, callback_state)) + { + deletable[ndeletable++] = offnum; + tuples_removed += 1; + } + else + num_index_tuples += 1; + } } - - offnum = ItemPointerGetOffsetNumber(current); - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); - itup = &btitem->bti_itup; - htup = &(itup->t_tid); - - if (callback(htup, callback_state)) + /* + * If we need to delete anything, do it and write the buffer; + * else just release the buffer. + */ + nextpage = opaque->btpo_next; + if (ndeletable > 0) { - /* Okay to delete the item from the page */ - _bt_itemdel(rel, buf, current); - - /* Mark buffer dirty, but keep the lock and pin */ - WriteNoReleaseBuffer(buf); - - tuples_removed += 1; - - /* - * We now need to back up the scan one item, so that the next - * cycle will re-examine the same offnum on this page (which - * now holds the next item). - */ - current->ip_posid--; + _bt_delitems(rel, buf, deletable, ndeletable); + _bt_wrtbuf(rel, buf); } else - num_index_tuples += 1; - } while (_bt_step(scan, &buf, ForwardScanDirection)); + { + _bt_relbuf(rel, buf); + } + /* And advance to next page, if any */ + if (nextpage == P_NONE) + break; + buf = _bt_getbuf(rel, nextpage, BT_READ); + } } - index_endscan(scan); - /* return statistics */ num_pages = RelationGetNumberOfBlocks(rel); @@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS) } } else if ((opaque->btpo_flags & BTP_HALF_DEAD) || - P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)) + P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)) { /* Empty, try to delete */ int ndel; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 058b13b6a4..a1a52571fe 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.2 2003/02/23 06:17:13 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.3 2003/02/23 22:43:08 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -379,11 +379,10 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record) return; xlrec = (xl_btree_delete *) XLogRecGetData(record); - reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->target.node); + reln = XLogOpenRelation(redo, RM_BTREE_ID, xlrec->node); if (!RelationIsValid(reln)) return; - buffer = XLogReadBuffer(false, reln, - ItemPointerGetBlockNumber(&(xlrec->target.tid))); + buffer = XLogReadBuffer(false, reln, xlrec->block); if (!BufferIsValid(buffer)) elog(PANIC, "btree_delete_redo: block unfound"); page = (Page) BufferGetPage(buffer); @@ -396,7 +395,21 @@ btree_xlog_delete(bool redo, XLogRecPtr lsn, XLogRecord *record) return; } - PageIndexTupleDelete(page, ItemPointerGetOffsetNumber(&(xlrec->target.tid))); + if (record->xl_len > SizeOfBtreeDelete) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete); + unend = (OffsetNumber *) ((char *) xlrec + record->xl_len); + + /* be careful to delete from back to front */ + while (unused < unend) + { + unend--; + PageIndexTupleDelete(page, *unend); + } + } PageSetLSN(page, lsn); PageSetSUI(page, ThisStartUpID); @@ -853,8 +866,8 @@ btree_desc(char *buf, uint8 xl_info, char *rec) { xl_btree_delete *xlrec = (xl_btree_delete *) rec; - strcat(buf, "delete: "); - out_target(buf, &(xlrec->target)); + sprintf(buf + strlen(buf), "delete: node %u/%u; blk %u", + xlrec->node.tblNode, xlrec->node.relNode, xlrec->block); break; } case XLOG_BTREE_DELETE_PAGE: diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 35b6c94e7e..d6ea70cd7d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $Id: nbtree.h,v 1.66 2003/02/23 06:17:13 tgl Exp $ + * $Id: nbtree.h,v 1.67 2003/02/23 22:43:09 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -263,14 +263,18 @@ typedef struct xl_btree_split #define SizeOfBtreeSplit (offsetof(xl_btree_split, leftlen) + sizeof(uint16)) /* - * This is what we need to know about delete of an individual leaf btitem + * This is what we need to know about delete of individual leaf btitems. + * The WAL record can represent deletion of any number of btitems on a + * single index page. */ typedef struct xl_btree_delete { - xl_btreetid target; /* deleted tuple id */ + RelFileNode node; + BlockNumber block; + /* TARGET OFFSET NUMBERS FOLLOW AT THE END */ } xl_btree_delete; -#define SizeOfBtreeDelete (offsetof(xl_btreetid, tid) + SizeOfIptrData) +#define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber)) /* * This is what we need to know about deletion of a btree page. The target @@ -453,7 +457,8 @@ extern void _bt_wrtnorelbuf(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern bool _bt_page_recyclable(Page page); extern void _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level); -extern void _bt_itemdel(Relation rel, Buffer buf, ItemPointer tid); +extern void _bt_delitems(Relation rel, Buffer buf, + OffsetNumber *itemnos, int nitems); extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full); /*