diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index f0ff91aba2..9b57b774e5 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -17,6 +17,7 @@ #include "access/gin_private.h" #include "access/xloginsert.h" #include "miscadmin.h" +#include "utils/memutils.h" #include "utils/rel.h" static void ginFindParents(GinBtree btree, GinBtreeStack *stack); @@ -310,15 +311,16 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack) * Insert a new item to a page. * * Returns true if the insertion was finished. On false, the page was split and - * the parent needs to be updated. (a root split returns true as it doesn't - * need any further action by the caller to complete) + * the parent needs to be updated. (A root split returns true as it doesn't + * need any further action by the caller to complete.) * * When inserting a downlink to an internal page, 'childbuf' contains the * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared - * atomically with the insert. Also, the existing item at the given location - * is updated to point to 'updateblkno'. + * atomically with the insert. Also, the existing item at offset stack->off + * in the target page is updated to point to updateblkno. * * stack->buffer is locked on entry, and is kept locked. + * Likewise for childbuf, if given. */ static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, @@ -326,11 +328,28 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, Buffer childbuf, GinStatsData *buildStats) { Page page = BufferGetPage(stack->buffer); + bool result; GinPlaceToPageRC rc; uint16 xlflags = 0; Page childpage = NULL; Page newlpage = NULL, newrpage = NULL; + void *ptp_workspace = NULL; + MemoryContext tmpCxt; + MemoryContext oldCxt; + + /* + * We do all the work of this function and its subfunctions in a temporary + * memory context. 
This avoids leakages and simplifies APIs, since some + * subfunctions allocate storage that has to survive until we've finished + * the WAL insertion. + */ + tmpCxt = AllocSetContextCreate(CurrentMemoryContext, + "ginPlaceToPage temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldCxt = MemoryContextSwitchTo(tmpCxt); if (GinPageIsData(page)) xlflags |= GIN_INSERT_ISDATA; @@ -348,40 +367,42 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } /* - * Try to put the incoming tuple on the page. placeToPage will decide if - * the page needs to be split. - * - * WAL-logging this operation is a bit funny: - * - * We're responsible for calling XLogBeginInsert() and XLogInsert(). - * XLogBeginInsert() must be called before placeToPage, because - * placeToPage can register some data to the WAL record. - * - * If placeToPage returns INSERTED, placeToPage has already called - * START_CRIT_SECTION() and XLogBeginInsert(), and registered any data - * required to replay the operation, in block index 0. We're responsible - * for filling in the main data portion of the WAL record, calling - * XLogInsert(), and END_CRIT_SECTION. - * - * If placeToPage returns SPLIT, we're wholly responsible for WAL logging. - * Splits happen infrequently, so we just make a full-page image of all - * the pages involved. + * See if the incoming tuple will fit on the page. beginPlaceToPage will + * decide if the page needs to be split, and will compute the split + * contents if so. See comments for beginPlaceToPage and execPlaceToPage + * functions for more details of the API here. 
*/ - rc = btree->placeToPage(btree, stack->buffer, stack, - insertdata, updateblkno, - &newlpage, &newrpage); - if (rc == UNMODIFIED) + rc = btree->beginPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, + &ptp_workspace, + &newlpage, &newrpage); + + if (rc == GPTP_NO_WORK) { - XLogResetInsertion(); - return true; + /* Nothing to do */ + result = true; } - else if (rc == INSERTED) + else if (rc == GPTP_INSERT) { - /* placeToPage did START_CRIT_SECTION() */ + /* It will fit, perform the insertion */ + START_CRIT_SECTION(); + + if (RelationNeedsWAL(btree->index)) + { + XLogBeginInsert(); + XLogRegisterBuffer(0, stack->buffer, REGBUF_STANDARD); + if (BufferIsValid(childbuf)) + XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD); + } + + /* Perform the page update, and register any extra WAL data */ + btree->execPlaceToPage(btree, stack->buffer, stack, + insertdata, updateblkno, ptp_workspace); + MarkBufferDirty(stack->buffer); /* An insert to an internal page finishes the split of the child. */ - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) { GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; MarkBufferDirty(childbuf); @@ -393,21 +414,15 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, ginxlogInsert xlrec; BlockIdData childblknos[2]; - /* - * placetopage already registered stack->buffer as block 0. - */ xlrec.flags = xlflags; - if (childbuf != InvalidBuffer) - XLogRegisterBuffer(1, childbuf, REGBUF_STANDARD); - XLogRegisterData((char *) &xlrec, sizeof(ginxlogInsert)); /* * Log information about child if this was an insertion of a * downlink. 
*/ - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) { BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); @@ -417,23 +432,29 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT); PageSetLSN(page, recptr); - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) PageSetLSN(childpage, recptr); } END_CRIT_SECTION(); - return true; + /* Insertion is complete. */ + result = true; } - else if (rc == SPLIT) + else if (rc == GPTP_SPLIT) { - /* Didn't fit, had to split */ + /* + * Didn't fit, need to split. The split has been computed in newlpage + * and newrpage, which are pointers to palloc'd pages, not associated + * with buffers. stack->buffer is not touched yet. + */ Buffer rbuffer; BlockNumber savedRightLink; ginxlogSplit data; Buffer lbuffer = InvalidBuffer; Page newrootpg = NULL; + /* Get a new index page to become the right page */ rbuffer = GinNewBuffer(btree->index); /* During index build, count the new page */ @@ -447,19 +468,11 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, savedRightLink = GinPageGetOpaque(page)->rightlink; - /* - * newlpage and newrpage are pointers to memory pages, not associated - * with buffers. stack->buffer is not touched yet. 
- */ - + /* Begin setting up WAL record */ data.node = btree->index->rd_node; data.flags = xlflags; - if (childbuf != InvalidBuffer) + if (BufferIsValid(childbuf)) { - Page childpage = BufferGetPage(childbuf); - - GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; - data.leftChildBlkno = BufferGetBlockNumber(childbuf); data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; } @@ -469,12 +482,12 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, if (stack->parent == NULL) { /* - * split root, so we need to allocate new left page and place - * pointer on root to left and right page + * splitting the root, so we need to allocate new left page and + * place pointers to left and right page on root page. */ lbuffer = GinNewBuffer(btree->index); - /* During index build, count the newly-added root page */ + /* During index build, count the new left page */ if (buildStats) { if (btree->isData) @@ -491,9 +504,9 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, /* * Construct a new root page containing downlinks to the new left - * and right pages. (do this in a temporary copy first rather than - * overwriting the original page directly, so that we can still - * abort gracefully if this fails.) + * and right pages. (Do this in a temporary copy rather than + * overwriting the original page directly, since we're not in the + * critical section yet.) */ newrootpg = PageGetTempPage(newrpage); GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); @@ -504,7 +517,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } else { - /* split non-root page */ + /* splitting a non-root page */ data.rrlink = savedRightLink; GinPageGetOpaque(newrpage)->rightlink = savedRightLink; @@ -513,41 +526,44 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, } /* - * Ok, we have the new contents of the left page in a temporary copy - * now (newlpage), and the newly-allocated right block has been filled - * in. 
The original page is still unchanged. + * OK, we have the new contents of the left page in a temporary copy + * now (newlpage), and likewise for the new contents of the + * newly-allocated right block. The original page is still unchanged. * * If this is a root split, we also have a temporary page containing - * the new contents of the root. Copy the new left page to a - * newly-allocated block, and initialize the (original) root page the - * new copy. Otherwise, copy over the temporary copy of the new left - * page over the old left page. + * the new contents of the root. */ START_CRIT_SECTION(); MarkBufferDirty(rbuffer); MarkBufferDirty(stack->buffer); - if (BufferIsValid(childbuf)) - MarkBufferDirty(childbuf); /* - * Restore the temporary copies over the real buffers. But don't free - * the temporary copies yet, WAL record data points to them. + * Restore the temporary copies over the real buffers. */ if (stack->parent == NULL) { + /* Splitting the root, three pages to update */ MarkBufferDirty(lbuffer); - memcpy(BufferGetPage(stack->buffer), newrootpg, BLCKSZ); + memcpy(page, newrootpg, BLCKSZ); memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } else { - memcpy(BufferGetPage(stack->buffer), newlpage, BLCKSZ); + /* Normal split, only two pages to update */ + memcpy(page, newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } + /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ + if (BufferIsValid(childbuf)) + { + GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; + MarkBufferDirty(childbuf); + } + /* write WAL record */ if (RelationNeedsWAL(btree->index)) { @@ -572,12 +588,13 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, XLogRegisterBuffer(1, rbuffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); } if (BufferIsValid(childbuf)) - XLogRegisterBuffer(3, childbuf, 0); + XLogRegisterBuffer(3, childbuf, REGBUF_STANDARD); XLogRegisterData((char *) &data, sizeof(ginxlogSplit)); 
recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT); - PageSetLSN(BufferGetPage(stack->buffer), recptr); + + PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(rbuffer), recptr); if (stack->parent == NULL) PageSetLSN(BufferGetPage(lbuffer), recptr); @@ -587,33 +604,31 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, END_CRIT_SECTION(); /* - * We can release the lock on the right page now, but keep the - * original buffer locked. + * We can release the locks/pins on the new pages now, but keep + * stack->buffer locked. childbuf doesn't get unlocked either. */ UnlockReleaseBuffer(rbuffer); if (stack->parent == NULL) UnlockReleaseBuffer(lbuffer); - pfree(newlpage); - pfree(newrpage); - if (newrootpg) - pfree(newrootpg); - /* * If we split the root, we're done. Otherwise the split is not * complete until the downlink for the new page has been inserted to * the parent. */ - if (stack->parent == NULL) - return true; - else - return false; + result = (stack->parent == NULL); } else { - elog(ERROR, "unknown return code from GIN placeToPage method: %d", rc); - return false; /* keep compiler quiet */ + elog(ERROR, "invalid return code from GIN beginPlaceToPage method: %d", rc); + result = false; /* keep compiler quiet */ } + + /* Clean up temp context */ + MemoryContextSwitchTo(oldCxt); + MemoryContextDelete(tmpCxt); + + return result; } /* diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index ec8c94bcbd..feac59d9e0 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -18,7 +18,6 @@ #include "access/xloginsert.h" #include "lib/ilist.h" #include "miscadmin.h" -#include "utils/memutils.h" #include "utils/rel.h" /* @@ -57,6 +56,13 @@ typedef struct int rsize; /* total size on right page */ bool oldformat; /* page is in pre-9.4 format on disk */ + + /* + * If we need WAL data representing the reconstructed leaf page, it's + * stored here by computeLeafRecompressWALData.
+ */ + char *walinfo; /* buffer start */ + int walinfolen; /* and length */ } disassembledLeaf; typedef struct @@ -105,10 +111,9 @@ static bool leafRepackItems(disassembledLeaf *leaf, ItemPointer remaining); static bool addItemsToLeaf(disassembledLeaf *leaf, ItemPointer newItems, int nNewItems); -static void registerLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf); +static void computeLeafRecompressWALData(disassembledLeaf *leaf); static void dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf); -static void dataPlaceToPageLeafSplit(Buffer buf, - disassembledLeaf *leaf, +static void dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, Page lpage, Page rpage); @@ -423,11 +428,22 @@ GinPageDeletePostingItem(Page page, OffsetNumber offset) } /* - * Places keys to leaf data page and fills WAL record. + * Prepare to insert data on a leaf data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. 
*/ static GinPlaceToPageRC -dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, Page *newlpage, Page *newrpage) +dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, + void **ptp_workspace, + Page *newlpage, Page *newrpage) { GinBtreeDataLeafInsertData *items = insertdata; ItemPointer newItems = &items->items[items->curitem]; @@ -440,15 +456,11 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, bool append; int segsize; Size freespace; - MemoryContext tmpCxt; - MemoryContext oldCxt; disassembledLeaf *leaf; leafSegmentInfo *lastleftinfo; ItemPointerData maxOldItem; ItemPointerData remaining; - Assert(GinPageIsData(page)); - rbound = *GinDataPageGetRightBound(page); /* @@ -472,18 +484,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, maxitems = i; } - /* - * The following operations do quite a lot of small memory allocations, - * create a temporary memory context so that we don't need to keep track - * of them individually. 
- */ - tmpCxt = AllocSetContextCreate(CurrentMemoryContext, - "Gin split temporary context", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldCxt = MemoryContextSwitchTo(tmpCxt); - + /* Disassemble the data on the page */ leaf = disassembleLeaf(page); /* @@ -548,16 +549,13 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, maxitems = Min(maxitems, nnewsegments * MinTuplesPerSegment); } - /* Add the new items to the segments */ + /* Add the new items to the segment list */ if (!addItemsToLeaf(leaf, newItems, maxitems)) { /* all items were duplicates, we have nothing to do */ items->curitem += maxitems; - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(tmpCxt); - - return UNMODIFIED; + return GPTP_NO_WORK; } /* @@ -590,22 +588,17 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (!needsplit) { /* - * Great, all the items fit on a single page. Construct a WAL record - * describing the changes we made, and write the segments back to the - * page. - * - * Once we start modifying the page, there's no turning back. The - * caller is responsible for calling END_CRIT_SECTION() after writing - * the WAL record. + * Great, all the items fit on a single page. If needed, prepare data + * for a WAL record describing the changes we'll make. */ - MemoryContextSwitchTo(oldCxt); if (RelationNeedsWAL(btree->index)) - { - XLogBeginInsert(); - registerLeafRecompressWALData(buf, leaf); - } - START_CRIT_SECTION(); - dataPlaceToPageLeafRecompress(buf, leaf); + computeLeafRecompressWALData(leaf); + + /* + * We're ready to enter the critical section, but + * dataExecPlaceToPageLeaf will need access to the "leaf" data. + */ + *ptp_workspace = leaf; if (append) elog(DEBUG2, "appended %d new items to block %u; %d bytes (%d to go)", @@ -619,7 +612,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, else { /* - * Had to split. + * Have to split. 
* * leafRepackItems already divided the segments between the left and * the right page. It filled the left page as full as possible, and @@ -631,7 +624,7 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, * until they're balanced. * * As a further heuristic, when appending items to the end of the - * page, try make the left page 75% full, one the assumption that + * page, try to make the left page 75% full, on the assumption that * subsequent insertions will probably also go to the end. This packs * the index somewhat tighter when appending to a table, which is very * common. @@ -680,10 +673,13 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, &lastleftinfo->nitems); lbound = lastleftinfo->items[lastleftinfo->nitems - 1]; - *newlpage = MemoryContextAlloc(oldCxt, BLCKSZ); - *newrpage = MemoryContextAlloc(oldCxt, BLCKSZ); + /* + * Now allocate a couple of temporary page images, and fill them. + */ + *newlpage = palloc(BLCKSZ); + *newrpage = palloc(BLCKSZ); - dataPlaceToPageLeafSplit(buf, leaf, lbound, rbound, + dataPlaceToPageLeafSplit(leaf, lbound, rbound, *newlpage, *newrpage); Assert(GinPageRightMost(page) || @@ -700,12 +696,31 @@ dataPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, items->nitem - items->curitem - maxitems); } - MemoryContextSwitchTo(oldCxt); - MemoryContextDelete(tmpCxt); - items->curitem += maxitems; - return needsplit ? SPLIT : INSERTED; + return needsplit ? GPTP_SPLIT : GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. 
+ */ +static void +dataExecPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, void *ptp_workspace) +{ + disassembledLeaf *leaf = (disassembledLeaf *) ptp_workspace; + + /* Apply changes to page */ + dataPlaceToPageLeafRecompress(buf, leaf); + + /* If needed, register WAL data built by computeLeafRecompressWALData */ + if (RelationNeedsWAL(btree->index)) + { + XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); + } } /* @@ -816,11 +831,11 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) } if (RelationNeedsWAL(indexrel)) - { - XLogBeginInsert(); - registerLeafRecompressWALData(buffer, leaf); - } + computeLeafRecompressWALData(leaf); + + /* Apply changes to page */ START_CRIT_SECTION(); + dataPlaceToPageLeafRecompress(buffer, leaf); MarkBufferDirty(buffer); @@ -829,6 +844,9 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) { XLogRecPtr recptr; + XLogBeginInsert(); + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, leaf->walinfo, leaf->walinfolen); recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_VACUUM_DATA_LEAF_PAGE); PageSetLSN(page, recptr); } @@ -839,10 +857,11 @@ ginVacuumPostingTreeLeaf(Relation indexrel, Buffer buffer, GinVacuumState *gvs) /* * Construct a ginxlogRecompressDataLeaf record representing the changes - * in *leaf. + * in *leaf. (Because this requires a palloc, we have to do it before + * we enter the critical section that actually updates the page.) 
*/ static void -registerLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) +computeLeafRecompressWALData(disassembledLeaf *leaf) { int nmodified = 0; char *walbufbegin; @@ -933,18 +952,15 @@ registerLeafRecompressWALData(Buffer buf, disassembledLeaf *leaf) segno++; } - - XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterBufData(0, walbufbegin, walbufend - walbufbegin); - + /* Pass back the constructed info via *leaf */ + leaf->walinfo = walbufbegin; + leaf->walinfolen = walbufend - walbufbegin; } /* * Assemble a disassembled posting tree leaf page back to a buffer. * - * *prdata is filled with WAL information about this operation. The caller - * is responsible for inserting to the WAL, along with any other information - * about the operation that triggered this recompression. + * This just updates the target buffer; WAL stuff is caller's responsibility. * * NOTE: The segment pointers must not point directly to the same buffer, * except for segments that have not been modified and whose preceding @@ -1003,11 +1019,11 @@ dataPlaceToPageLeafRecompress(Buffer buf, disassembledLeaf *leaf) * segments to two pages instead of one. * * This is different from the non-split cases in that this does not modify - * the original page directly, but to temporary in-memory copies of the new - * left and right pages. + * the original page directly, but writes to temporary in-memory copies of + * the new left and right pages. */ static void -dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, +dataPlaceToPageLeafSplit(disassembledLeaf *leaf, ItemPointerData lbound, ItemPointerData rbound, Page lpage, Page rpage) { @@ -1076,39 +1092,55 @@ dataPlaceToPageLeafSplit(Buffer buf, disassembledLeaf *leaf, } /* - * Place a PostingItem to page, and fill a WAL record. + * Prepare to insert data on an internal data page. * - * If the item doesn't fit, returns false without modifying the page. 
+ * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. * - * In addition to inserting the given item, the downlink of the existing item - * at 'off' is updated to point to 'updateblkno'. + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. * - * On INSERTED, registers the buffer as buffer ID 0, with data. - * On SPLIT, returns rdata that represents the split pages in *prdata. + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. */ static GinPlaceToPageRC -dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, BlockNumber updateblkno, - Page *newlpage, Page *newrpage) +dataBeginPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) { Page page = BufferGetPage(buf); - OffsetNumber off = stack->off; - PostingItem *pitem; - /* this must be static so it can be returned to caller */ - static ginxlogInsertDataInternal data; - - /* split if we have to */ + /* If it doesn't fit, deal with split case */ if (GinNonLeafDataPageGetFreeSpace(page) < sizeof(PostingItem)) { dataSplitPageInternal(btree, buf, stack, insertdata, updateblkno, newlpage, newrpage); - return SPLIT; + return GPTP_SPLIT; } - Assert(GinPageIsData(page)); + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} - START_CRIT_SECTION(); +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. 
+ * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + */ +static void +dataExecPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace) +{ + Page page = BufferGetPage(buf); + OffsetNumber off = stack->off; + PostingItem *pitem; /* Update existing downlink to point to next page (on internal page) */ pitem = GinDataPageGetPostingItem(page, off); @@ -1120,43 +1152,90 @@ dataPlaceToPageInternal(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (RelationNeedsWAL(btree->index)) { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertDataInternal data; + data.offset = off; data.newitem = *pitem; - XLogBeginInsert(); - XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) &data, sizeof(ginxlogInsertDataInternal)); } - - return INSERTED; } /* - * Places an item (or items) to a posting tree. Calls relevant function of - * internal of leaf page because they are handled very differently. + * Prepare to insert data on a posting-tree data page. + * + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. + * + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. + * + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. 
+ * + * Calls relevant function for internal or leaf page because they are handled + * very differently. */ static GinPlaceToPageRC -dataPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertdata, BlockNumber updateblkno, - Page *newlpage, Page *newrpage) +dataBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) { Page page = BufferGetPage(buf); Assert(GinPageIsData(page)); if (GinPageIsLeaf(page)) - return dataPlaceToPageLeaf(btree, buf, stack, insertdata, - newlpage, newrpage); + return dataBeginPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace, + newlpage, newrpage); else - return dataPlaceToPageInternal(btree, buf, stack, - insertdata, updateblkno, - newlpage, newrpage); + return dataBeginPlaceToPageInternal(btree, buf, stack, + insertdata, updateblkno, + ptp_workspace, + newlpage, newrpage); } /* - * Split page and fill WAL record. Returns a new temp buffer filled with data - * that should go to the left page. The original buffer is left untouched. + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + * + * Calls relevant function for internal or leaf page because they are handled + * very differently. + */ +static void +dataExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertdata, BlockNumber updateblkno, + void *ptp_workspace) +{ + Page page = BufferGetPage(buf); + + if (GinPageIsLeaf(page)) + dataExecPlaceToPageLeaf(btree, buf, stack, insertdata, + ptp_workspace); + else + dataExecPlaceToPageInternal(btree, buf, stack, insertdata, + updateblkno, ptp_workspace); +} + +/* + * Split internal page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. 
+ * The original buffer is left untouched. */ static void dataSplitPageInternal(GinBtree btree, Buffer origbuf, @@ -1231,6 +1310,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, /* set up right bound for right page */ *GinDataPageGetRightBound(rpage) = oldbound; + /* return temp pages to caller */ *newlpage = lpage; *newrpage = rpage; } @@ -1789,7 +1869,8 @@ ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno) btree->isMoveRight = dataIsMoveRight; btree->findItem = NULL; btree->findChildPtr = dataFindChildPtr; - btree->placeToPage = dataPlaceToPage; + btree->beginPlaceToPage = dataBeginPlaceToPage; + btree->execPlaceToPage = dataExecPlaceToPage; btree->fillRoot = ginDataFillRoot; btree->prepareDownlink = dataPrepareDownlink; diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index c912e60a11..a022f50ffa 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -21,7 +21,7 @@ static void entrySplitPage(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, - void *insertPayload, + GinBtreeEntryInsertData *insertData, BlockNumber updateblkno, Page *newlpage, Page *newrpage); @@ -508,40 +508,58 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off, } /* - * Place tuple on page and fills WAL record + * Prepare to insert data on an entry page. * - * If the tuple doesn't fit, returns false without modifying the page. + * If it will fit, return GPTP_INSERT after doing whatever setup is needed + * before we enter the insertion critical section. *ptp_workspace can be + * set to pass information along to the execPlaceToPage function. * - * On insertion to an internal node, in addition to inserting the given item, - * the downlink of the existing item at 'off' is updated to point to - * 'updateblkno'. + * If it won't fit, perform a page split and return two temporary page + * images into *newlpage and *newrpage, with result GPTP_SPLIT. 
* - * On INSERTED, registers the buffer as buffer ID 0, with data. - * On SPLIT, returns rdata that represents the split pages in *prdata. + * In neither case should the given page buffer be modified here. + * + * Note: on insertion to an internal node, in addition to inserting the given + * item, the downlink of the existing item at stack->off will be updated to + * point to updateblkno. */ static GinPlaceToPageRC -entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, - void *insertPayload, BlockNumber updateblkno, - Page *newlpage, Page *newrpage) +entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void **ptp_workspace, + Page *newlpage, Page *newrpage) +{ + GinBtreeEntryInsertData *insertData = insertPayload; + OffsetNumber off = stack->off; + + /* If it doesn't fit, deal with split case */ + if (!entryIsEnoughSpace(btree, buf, off, insertData)) + { + entrySplitPage(btree, buf, stack, insertData, updateblkno, + newlpage, newrpage); + return GPTP_SPLIT; + } + + /* Else, we're ready to proceed with insertion */ + return GPTP_INSERT; +} + +/* + * Perform data insertion after beginPlaceToPage has decided it will fit. + * + * This is invoked within a critical section, and XLOG record creation (if + * needed) is already started. The target buffer is registered in slot 0. + */ +static void +entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, + void *insertPayload, BlockNumber updateblkno, + void *ptp_workspace) { GinBtreeEntryInsertData *insertData = insertPayload; Page page = BufferGetPage(buf); OffsetNumber off = stack->off; OffsetNumber placed; - /* this must be static so it can be returned to caller. 
*/ - static ginxlogInsertEntry data; - - /* quick exit if it doesn't fit */ - if (!entryIsEnoughSpace(btree, buf, off, insertData)) - { - entrySplitPage(btree, buf, stack, insertPayload, updateblkno, - newlpage, newrpage); - return SPLIT; - } - - START_CRIT_SECTION(); - entryPreparePage(btree, page, off, insertData, updateblkno); placed = PageAddItem(page, @@ -554,34 +572,36 @@ entryPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack, if (RelationNeedsWAL(btree->index)) { + /* + * This must be static, because it has to survive until XLogInsert, + * and we can't palloc here. Ugly, but the XLogInsert infrastructure + * isn't reentrant anyway. + */ + static ginxlogInsertEntry data; + data.isDelete = insertData->isDelete; data.offset = off; - XLogBeginInsert(); - XLogRegisterBuffer(0, buf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) &data, offsetof(ginxlogInsertEntry, tuple)); XLogRegisterBufData(0, (char *) insertData->entry, IndexTupleSize(insertData->entry)); } - - return INSERTED; } /* - * Place tuple and split page, original buffer(lbuf) leaves untouched, - * returns shadow pages filled with new data. - * Tuples are distributed between pages by equal size on its, not - * an equal number! + * Split entry page and insert new data. + * + * Returns new temp pages to *newlpage and *newrpage. + * The original buffer is left untouched. */ static void entrySplitPage(GinBtree btree, Buffer origbuf, GinBtreeStack *stack, - void *insertPayload, + GinBtreeEntryInsertData *insertData, BlockNumber updateblkno, Page *newlpage, Page *newrpage) { - GinBtreeEntryInsertData *insertData = insertPayload; OffsetNumber off = stack->off; OffsetNumber i, maxoff, @@ -646,6 +666,10 @@ entrySplitPage(GinBtree btree, Buffer origbuf, { itup = (IndexTuple) ptr; + /* + * Decide where to split. We try to equalize the pages' total data + * size, not number of tuples. 
+ */ if (lsize > totalsize / 2) { if (separator == InvalidOffsetNumber) @@ -663,6 +687,7 @@ entrySplitPage(GinBtree btree, Buffer origbuf, ptr += MAXALIGN(IndexTupleSize(itup)); } + /* return temp pages to caller */ *newlpage = lpage; *newrpage = rpage; } @@ -731,7 +756,8 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, btree->isMoveRight = entryIsMoveRight; btree->findItem = entryLocateLeafEntry; btree->findChildPtr = entryFindChildPtr; - btree->placeToPage = entryPlaceToPage; + btree->beginPlaceToPage = entryBeginPlaceToPage; + btree->execPlaceToPage = entryExecPlaceToPage; btree->fillRoot = ginEntryFillRoot; btree->prepareDownlink = entryPrepareDownlink; diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 06c57807be..fce9e22160 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -420,14 +420,14 @@ typedef struct ginxlogCreatePostingTree typedef struct { - uint16 flags; /* GIN_SPLIT_ISLEAF and/or GIN_SPLIT_ISDATA */ + uint16 flags; /* GIN_INSERT_ISLEAF and/or GIN_INSERT_ISDATA */ /* * FOLLOWS: * * 1. if not leaf page, block numbers of the left and right child pages - * whose split this insertion finishes. As BlockIdData[2] (beware of - * adding fields before this that would make them not 16-bit aligned) + * whose split this insertion finishes, as BlockIdData[2] (beware of + * adding fields in this struct that would make them not 16-bit aligned) * * 2. a ginxlogInsertEntry or ginxlogRecompressDataLeaf struct, depending * on tree type. @@ -499,21 +499,19 @@ typedef struct ginxlogSplit * split */ BlockNumber leftChildBlkno; /* valid on a non-leaf split */ BlockNumber rightChildBlkno; - uint16 flags; - - /* follows: one of the following structs */ + uint16 flags; /* see below */ } ginxlogSplit; /* * Flags used in ginxlogInsert and ginxlogSplit records */ #define GIN_INSERT_ISDATA 0x01 /* for both insert and split records */ -#define GIN_INSERT_ISLEAF 0x02 /* .. 
*/ +#define GIN_INSERT_ISLEAF 0x02 /* ditto */ #define GIN_SPLIT_ROOT 0x04 /* only for split records */ /* * Vacuum simply WAL-logs the whole page, when anything is modified. This - * functionally identical heap_newpage records, but is kept separate for + * is functionally identical to heap_newpage records, but is kept separate for * debugging purposes. (When inspecting the WAL stream, it's easier to see * what's going on when GIN vacuum records are marked as such, not as heap * records.) This is currently only used for entry tree leaf pages. @@ -637,12 +635,12 @@ typedef struct GinBtreeStack typedef struct GinBtreeData *GinBtree; -/* Return codes for GinBtreeData.placeToPage method */ +/* Return codes for GinBtreeData.beginPlaceToPage method */ typedef enum { - UNMODIFIED, - INSERTED, - SPLIT + GPTP_NO_WORK, + GPTP_INSERT, + GPTP_SPLIT } GinPlaceToPageRC; typedef struct GinBtreeData @@ -655,7 +653,8 @@ typedef struct GinBtreeData /* insert methods */ OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber); - GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, Page *, Page *); + GinPlaceToPageRC (*beginPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void **, Page *, Page *); + void (*execPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void *); void *(*prepareDownlink) (GinBtree, Buffer); void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page);