diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile index 889dde6a27..aabc62ff22 100644 --- a/src/backend/access/gin/Makefile +++ b/src/backend/access/gin/Makefile @@ -14,6 +14,6 @@ include $(top_builddir)/src/Makefile.global OBJS = ginutil.o gininsert.o ginxlog.o ginentrypage.o gindatapage.o \ ginbtree.o ginscan.o ginget.o ginvacuum.o ginarrayproc.o \ - ginbulk.o ginfast.o + ginbulk.o ginfast.o ginpostinglist.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index 67159d8529..434d398bf7 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -104,7 +104,7 @@ a few thousand entries can be much faster than retail insertion. (The win comes mainly from not having to do multiple searches/insertions when the same key appears in multiple new heap tuples.) -Key entries are nominally of the same IndexEntry format as used in other +Key entries are nominally of the same IndexTuple format as used in other index types, but since a leaf key entry typically refers to multiple heap tuples, there are significant differences. (See GinFormTuple, which works by building a "normal" index tuple and then modifying it.) The points to diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 2a6be4b1a9..f032faa22e 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -264,7 +264,7 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack, * Insert value (stored in GinBtree) to tree described by stack * * During an index build, buildStats is non-null and the counters - * it contains should be incremented as needed. + * it contains are incremented as needed. * * NB: the passed-in stack is freed, as though by freeGinBtreeStack. 
*/ @@ -290,15 +290,15 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) { XLogRecData *rdata; BlockNumber savedRightLink; + bool fit; page = BufferGetPage(stack->buffer); savedRightLink = GinPageGetOpaque(page)->rightlink; - if (btree->isEnoughSpace(btree, stack->buffer, stack->off)) + START_CRIT_SECTION(); + fit = btree->placeToPage(btree, stack->buffer, stack->off, &rdata); + if (fit) { - START_CRIT_SECTION(); - btree->placeToPage(btree, stack->buffer, stack->off, &rdata); - MarkBufferDirty(stack->buffer); if (RelationNeedsWAL(btree->index)) @@ -318,12 +318,17 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) } else { - Buffer rbuffer = GinNewBuffer(btree->index); + /* Didn't fit, have to split */ + Buffer rbuffer; Page newlpage; + END_CRIT_SECTION(); + + rbuffer = GinNewBuffer(btree->index); + /* - * newlpage is a pointer to memory page, it doesn't associate with - * buffer, stack->buffer should be untouched + * newlpage is a pointer to memory page, it is not associated with + * a buffer. stack->buffer is not touched yet. */ newlpage = btree->splitPage(btree, stack->buffer, rbuffer, stack->off, &rdata); diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 2c6447d7fa..d67e50555c 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -15,47 +15,9 @@ #include "postgres.h" #include "access/gin_private.h" +#include "miscadmin.h" #include "utils/rel.h" -/* - * Merge two ordered arrays of itempointers, eliminating any duplicates. - * Returns the number of items in the result. - * Caller is responsible that there is enough space at *dst. 
- */ -uint32 -ginMergeItemPointers(ItemPointerData *dst, - ItemPointerData *a, uint32 na, - ItemPointerData *b, uint32 nb) -{ - ItemPointerData *dptr = dst; - ItemPointerData *aptr = a, - *bptr = b; - - while (aptr - a < na && bptr - b < nb) - { - int cmp = ginCompareItemPointers(aptr, bptr); - - if (cmp > 0) - *dptr++ = *bptr++; - else if (cmp == 0) - { - /* we want only one copy of the identical items */ - *dptr++ = *bptr++; - aptr++; - } - else - *dptr++ = *aptr++; - } - - while (aptr - a < na) - *dptr++ = *aptr++; - - while (bptr - b < nb) - *dptr++ = *bptr++; - - return dptr - dst; -} - /* * Checks, should we move to right link... * Compares inserting itemp pointer with right bound of current page @@ -387,9 +349,12 @@ dataPrepareData(GinBtree btree, Page page, OffsetNumber off) /* * Places keys to page and fills WAL record. In case leaf page and * build mode puts all ItemPointers to page. + * + * If none of the keys fit, returns false without modifying the page. */ -static void -dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata) +static bool +dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, + XLogRecData **prdata) { Page page = BufferGetPage(buf); int sizeofitem = GinSizeOfDataPageItem(page); @@ -399,6 +364,10 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prda static XLogRecData rdata[3]; static ginxlogInsert data; + /* quick exit if it doesn't fit */ + if (!dataIsEnoughSpace(btree, buf, off)) + return false; + *prdata = rdata; Assert(GinPageIsData(page)); @@ -464,6 +433,8 @@ dataPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prda } else GinDataPageAddPostingItem(page, &(btree->pitem), off); + + return true; } /* @@ -545,8 +516,8 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe } /* - * we suppose that during index creation table scaned from begin to end, - * so ItemPointers are monotonically increased.. 
+	 * we assume that during index creation the table is scanned from beginning
+	 * to end, so ItemPointers are in monotonically increasing order.
 	 */
 	if (btree->isBuild && GinPageRightMost(lpage))
 		separator = freeSpace / sizeofitem;
@@ -575,6 +546,11 @@ dataSplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRe
 
 	GinPageGetOpaque(rpage)->maxoff = maxoff - separator;
 
+	/*
+	 * Prepare a downlink tuple for insertion to the parent.  This must be
+	 * done before the right bound of the left page is set, because the bound
+	 * is copied from btree->pitem.key.
+	 */
 	PostingItemSetBlockNumber(&(btree->pitem), BufferGetBlockNumber(lbuf));
 	if (GinPageIsLeaf(lpage))
 		btree->pitem.key = *GinDataPageGetItemPointer(lpage,
@@ -638,6 +614,98 @@ ginDataFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
 	GinDataPageAddPostingItem(page, &ri, InvalidOffsetNumber);
 }
 
+/*
+ * Creates new posting tree containing the given TIDs. Returns the page
+ * number of the root of the new posting tree.
+ *
+ * items[] must be in sorted order with no duplicates.
+ *
+ * At most GinMaxLeafDataItems TIDs are stored on the root page itself; any
+ * remaining TIDs are added to the tree with ginInsertItemPointers.  If
+ * buildStats is not NULL, the new root page is counted in
+ * buildStats->nDataPages.
+ */
+BlockNumber
+createPostingTree(Relation index, ItemPointerData *items, uint32 nitems,
+				  GinStatsData *buildStats)
+{
+	BlockNumber blkno;
+	Buffer		buffer;
+	Page		page;
+	int			itemsCount;
+
+	/* Calculate how many TIDs will fit on first page. */
+	itemsCount = Min(nitems, GinMaxLeafDataItems);
+
+	/*
+	 * Create the root page.  Only the first itemsCount TIDs go on it; copying
+	 * all nitems would overrun the page when they don't fit.
+	 */
+	buffer = GinNewBuffer(index);
+	page = BufferGetPage(buffer);
+	blkno = BufferGetBlockNumber(buffer);
+
+	START_CRIT_SECTION();
+
+	GinInitBuffer(buffer, GIN_DATA | GIN_LEAF);
+	memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * itemsCount);
+	GinPageGetOpaque(page)->maxoff = itemsCount;
+
+	MarkBufferDirty(buffer);
+
+	if (RelationNeedsWAL(index))
+	{
+		XLogRecPtr	recptr;
+		XLogRecData rdata[2];
+		ginxlogCreatePostingTree data;
+
+		data.node = index->rd_node;
+		data.blkno = blkno;
+		data.nitem = itemsCount;
+
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].data = (char *) &data;
+		rdata[0].len = sizeof(ginxlogCreatePostingTree);
+		rdata[0].next = &rdata[1];
+
+		rdata[1].buffer = InvalidBuffer;
+		rdata[1].data = (char *) items;
+		rdata[1].len = sizeof(ItemPointerData) * itemsCount;
+		rdata[1].next = NULL;
+
+		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata);
+		PageSetLSN(page, recptr);
+	}
+
+	UnlockReleaseBuffer(buffer);
+
+	END_CRIT_SECTION();
+
+	/* During index build, count the newly-added data page */
+	if (buildStats)
+		buildStats->nDataPages++;
+
+	/*
+	 * Add any remaining TIDs to the newly-created posting tree.
+	 */
+	if (itemsCount < nitems)
+	{
+		GinPostingTreeScan *gdi;
+
+		gdi = ginPrepareScanPostingTree(index, blkno, FALSE);
+		gdi->btree.isBuild = (buildStats != NULL);
+
+		ginInsertItemPointers(gdi,
+							  items + itemsCount,
+							  nitems - itemsCount,
+							  buildStats);
+
+		pfree(gdi);
+	}
+
+	return blkno;
+}
+
 void
 ginPrepareDataScan(GinBtree btree, Relation index)
 {
@@ -650,7 +718,6 @@ ginPrepareDataScan(GinBtree btree, Relation index)
 	btree->findItem = dataLocateLeafItem;
 	btree->findChildPtr = dataFindChildPtr;
 	btree->getLeftMostPage = dataGetLeftMostPage;
-	btree->isEnoughSpace = dataIsEnoughSpace;
 	btree->placeToPage = dataPlaceToPage;
 	btree->splitPage = dataSplitPage;
 	btree->fillRoot = ginDataFillRoot;
diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c
index 7733028fba..0ed0a3db7e 100644
--- a/src/backend/access/gin/ginentrypage.c
+++ b/src/backend/access/gin/ginentrypage.c
@@ -486,9 +486,12 @@ entryPreparePage(GinBtree btree, Page page, OffsetNumber off)
 
 /*
  * Place tuple on page and fills WAL record
+ *
+ * If the tuple doesn't fit, returns false without modifying the page.
*/ -static void -entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata) +static bool +entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, + XLogRecData **prdata) { Page page = BufferGetPage(buf); OffsetNumber placed; @@ -498,6 +501,10 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd static XLogRecData rdata[3]; static ginxlogInsert data; + /* quick exit if it doesn't fit */ + if (!entryIsEnoughSpace(btree, buf, off)) + return false; + *prdata = rdata; data.updateBlkno = entryPreparePage(btree, page, off); @@ -543,6 +550,8 @@ entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prd rdata[cnt].next = NULL; btree->entry = NULL; + + return true; } /* @@ -724,7 +733,6 @@ ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum, btree->findItem = entryLocateLeafEntry; btree->findChildPtr = entryFindChildPtr; btree->getLeftMostPage = entryGetLeftMostPage; - btree->isEnoughSpace = entryIsEnoughSpace; btree->placeToPage = entryPlaceToPage; btree->splitPage = entrySplitPage; btree->fillRoot = ginEntryFillRoot; diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index beaa65317f..125f3fb12d 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -35,64 +35,6 @@ typedef struct BuildAccumulator accum; } GinBuildState; -/* - * Creates new posting tree with one page, containing the given TIDs. - * Returns the page number (which will be the root of this posting tree). - * - * items[] must be in sorted order with no duplicates. 
- */ -static BlockNumber -createPostingTree(Relation index, ItemPointerData *items, uint32 nitems) -{ - BlockNumber blkno; - Buffer buffer = GinNewBuffer(index); - Page page; - - /* Assert that the items[] array will fit on one page */ - Assert(nitems <= GinMaxLeafDataItems); - - START_CRIT_SECTION(); - - GinInitBuffer(buffer, GIN_DATA | GIN_LEAF); - page = BufferGetPage(buffer); - blkno = BufferGetBlockNumber(buffer); - - memcpy(GinDataPageGetData(page), items, sizeof(ItemPointerData) * nitems); - GinPageGetOpaque(page)->maxoff = nitems; - - MarkBufferDirty(buffer); - - if (RelationNeedsWAL(index)) - { - XLogRecPtr recptr; - XLogRecData rdata[2]; - ginxlogCreatePostingTree data; - - data.node = index->rd_node; - data.blkno = blkno; - data.nitem = nitems; - - rdata[0].buffer = InvalidBuffer; - rdata[0].data = (char *) &data; - rdata[0].len = sizeof(ginxlogCreatePostingTree); - rdata[0].next = &rdata[1]; - - rdata[1].buffer = InvalidBuffer; - rdata[1].data = (char *) items; - rdata[1].len = sizeof(ItemPointerData) * nitems; - rdata[1].next = NULL; - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_PTREE, rdata); - PageSetLSN(page, recptr); - } - - UnlockReleaseBuffer(buffer); - - END_CRIT_SECTION(); - - return blkno; -} - /* * Adds array of item pointers to tuple's posting list, or @@ -148,11 +90,8 @@ addItemPointersToLeafTuple(GinState *ginstate, */ postingRoot = createPostingTree(ginstate->index, GinGetPosting(old), - GinGetNPosting(old)); - - /* During index build, count the newly-added data page */ - if (buildStats) - buildStats->nDataPages++; + GinGetNPosting(old), + buildStats); /* Now insert the TIDs-to-be-added into the posting tree */ gdi = ginPrepareScanPostingTree(ginstate->index, postingRoot, FALSE); @@ -186,7 +125,7 @@ buildFreshLeafTuple(GinState *ginstate, { IndexTuple res; - /* try to build tuple with room for all the items */ + /* try to build a posting list tuple with all the items */ res = GinFormTuple(ginstate, attnum, key, category, items, 
nitem, false);
 
@@ -202,32 +141,11 @@ buildFreshLeafTuple(GinState *ginstate,
 		res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true);
 
 		/*
-		 * Initialize posting tree with as many TIDs as will fit on the first
-		 * page.
+		 * Initialize a new posting tree with the TIDs.  buildStats is passed
+		 * down so that the new root page is counted during index build.
 		 */
-		postingRoot = createPostingTree(ginstate->index,
-										items,
-										Min(nitem, GinMaxLeafDataItems));
-
-		/* During index build, count the newly-added data page */
-		if (buildStats)
-			buildStats->nDataPages++;
-
-		/* Add any remaining TIDs to the posting tree */
-		if (nitem > GinMaxLeafDataItems)
-		{
-			GinPostingTreeScan *gdi;
-
-			gdi = ginPrepareScanPostingTree(ginstate->index, postingRoot, FALSE);
-			gdi->btree.isBuild = (buildStats != NULL);
-
-			ginInsertItemPointers(gdi,
-								  items + GinMaxLeafDataItems,
-								  nitem - GinMaxLeafDataItems,
-								  buildStats);
-
-			pfree(gdi);
-		}
+		postingRoot = createPostingTree(ginstate->index, items, nitem,
+										buildStats);
 
 		/* And save the root link in the result tuple */
 		GinSetPostingTree(res, postingRoot);
diff --git a/src/backend/access/gin/ginpostinglist.c b/src/backend/access/gin/ginpostinglist.c
new file mode 100644
index 0000000000..e5a15bf2a9
--- /dev/null
+++ b/src/backend/access/gin/ginpostinglist.c
@@ -0,0 +1,56 @@
+/*-------------------------------------------------------------------------
+ *
+ * ginpostinglist.c
+ *	  routines for dealing with posting lists.
+ *
+ *
+ * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *			src/backend/access/gin/ginpostinglist.c
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/gin_private.h"
+
+/*
+ * Merge two ordered arrays of itempointers, eliminating any duplicates.
+ * Returns the number of items in the result.
+ * Caller is responsible that there is enough space at *dst.
+ */ +uint32 +ginMergeItemPointers(ItemPointerData *dst, + ItemPointerData *a, uint32 na, + ItemPointerData *b, uint32 nb) +{ + ItemPointerData *dptr = dst; + ItemPointerData *aptr = a, + *bptr = b; + + while (aptr - a < na && bptr - b < nb) + { + int cmp = ginCompareItemPointers(aptr, bptr); + + if (cmp > 0) + *dptr++ = *bptr++; + else if (cmp == 0) + { + /* we want only one copy of the identical items */ + *dptr++ = *bptr++; + aptr++; + } + else + *dptr++ = *aptr++; + } + + while (aptr - a < na) + *dptr++ = *aptr++; + + while (bptr - b < nb) + *dptr++ = *bptr++; + + return dptr - dst; +} diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index bda9c60279..7b2c39965a 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -33,23 +33,26 @@ typedef struct /* - * Cleans array of ItemPointer (removes dead pointers) - * Results are always stored in *cleaned, which will be allocated - * if it's needed. In case of *cleaned!=NULL caller is responsible to - * have allocated enough space. *cleaned and items may point to the same - * memory address. + * Vacuums a list of item pointers. The original size of the list is 'nitem', + * returns the number of items remaining afterwards. + * + * If *cleaned == NULL on entry, the original array is left unmodified; if + * any items are removed, a palloc'd copy of the result is stored in *cleaned. + * Otherwise *cleaned should point to the original array, in which case it's + * modified directly. 
*/ - -static uint32 -ginVacuumPostingList(GinVacuumState *gvs, ItemPointerData *items, uint32 nitem, ItemPointerData **cleaned) +static int +ginVacuumPostingList(GinVacuumState *gvs, ItemPointerData *items, int nitem, + ItemPointerData **cleaned) { - uint32 i, + int i, j = 0; + Assert(*cleaned == NULL || *cleaned == items); + /* * just scan over ItemPointer array */ - for (i = 0; i < nitem; i++) { if (gvs->callback(items + i, gvs->callback_state)) @@ -385,7 +388,8 @@ typedef struct DataPageDeleteStack * scans posting tree and deletes empty pages */ static bool -ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDeleteStack *parent, OffsetNumber myoff) +ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, + DataPageDeleteStack *parent, OffsetNumber myoff) { DataPageDeleteStack *me; Buffer buffer; @@ -431,15 +435,13 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDel if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber) { + /* the page is empty */ if (!(me->leftBlkno == InvalidBlockNumber && GinPageRightMost(page))) { /* we never delete right most branch */ Assert(!isRoot); - if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber) - { - ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot); - meDelete = TRUE; - } + ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot); + meDelete = TRUE; } } @@ -517,11 +519,12 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 else if (GinGetNPosting(itup) > 0) { /* - * if we already create temporary page, we will make changes in - * place + * if we already created a temporary page, make changes in place */ ItemPointerData *cleaned = (tmppage == origpage) ? 
NULL : GinGetPosting(itup); - uint32 newN = ginVacuumPostingList(gvs, GinGetPosting(itup), GinGetNPosting(itup), &cleaned); + int newN; + + newN = ginVacuumPostingList(gvs, GinGetPosting(itup), GinGetNPosting(itup), &cleaned); if (GinGetNPosting(itup) != newN) { @@ -530,15 +533,13 @@ ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint3 GinNullCategory category; /* - * Some ItemPointers was deleted, so we should remake our - * tuple + * Some ItemPointers were deleted, recreate tuple. */ - if (tmppage == origpage) { /* - * On first difference we create temporary page in memory - * and copies content in to it. + * On first difference, create a temporary copy of the + * page and copy the tuple's posting list to it. */ tmppage = PageGetTempPageCopy(origpage); diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 7af3a870c7..c1462ee2fe 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -485,8 +485,7 @@ typedef struct GinBtreeData /* insert methods */ OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber); BlockNumber (*getLeftMostPage) (GinBtree, Page); - bool (*isEnoughSpace) (GinBtree, Buffer, OffsetNumber); - void (*placeToPage) (GinBtree, Buffer, OffsetNumber, XLogRecData **); + bool (*placeToPage) (GinBtree, Buffer, OffsetNumber, XLogRecData **); Page (*splitPage) (GinBtree, Buffer, Buffer, OffsetNumber, XLogRecData **); void (*fillRoot) (GinBtree, Buffer, Buffer, Buffer);