GiST microvacuum: let index scans flag dead leaf tuples and reclaim their
space before a full leaf page is split.

* gistget.c: gistgettuple() records the page offsets of tuples the caller
  reports through kill_prior_tuple; gistkillitems() later sets LP_DEAD on
  them, but only if the page LSN still equals the one gistScanPage() saved.
  gistScanPage() skips LP_DEAD items when ignore_killed_tuples is set.
* gist.c: gistplacetopage() calls the new gistvacuumpage() on a full leaf
  page flagged F_HAS_GARBAGE and re-checks for space before splitting.
* gist.h, gist_private.h, gistscan.c: the F_HAS_GARBAGE flag and its macros,
  a per-tuple offnum, and the new scan-state fields with their initialization.

Review notes: in gistgettuple() the "item" declaration is hoisted above the
gistkillitems() call so that no declaration follows a statement.  The
whitespace-only change to recheckDistances was dropped.  Tab/space layout of
context lines was reconstructed and should be checked against the target
tree; the post-image blob ids on the index lines are those of the original
submission.

diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 0e499598a4..4edc5a75f2 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -36,6 +36,7 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
 				 bool unlockbuf, bool unlockleftchild);
 static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
 				GISTSTATE *giststate, List *splitinfo, bool releasebuf);
+static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
 
 
 #define ROTATEDIST(d) do { \
@@ -209,6 +210,17 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 	 * because the tuple vector passed to gistSplit won't include this tuple.
 	 */
 	is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+
+	/*
+	 * If leaf page is full, try at first to delete dead tuples. And then
+	 * check again.
+	 */
+	if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
+	{
+		gistvacuumpage(rel, page, buffer);
+		is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+	}
+
 	if (is_split)
 	{
 		/* no space for insertion */
@@ -1440,3 +1452,73 @@ freeGISTstate(GISTSTATE *giststate)
 	/* It's sufficient to delete the scanCxt */
 	MemoryContextDelete(giststate->scanCxt);
 }
+
+/*
+ * gistvacuumpage() -- try to remove LP_DEAD items from the given page.
+ * Function assumes that buffer is exclusively locked.
+ */
+static void
+gistvacuumpage(Relation rel, Page page, Buffer buffer)
+{
+	OffsetNumber deletable[MaxIndexTuplesPerPage];
+	int			ndeletable = 0;
+	OffsetNumber offnum, maxoff;
+
+	Assert(GistPageIsLeaf(page));
+
+	/*
+	 * Scan over all items to see which ones need to be deleted according to
+	 * LP_DEAD flags.
+	 */
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (offnum = FirstOffsetNumber;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemId = PageGetItemId(page, offnum);
+
+		if (ItemIdIsDead(itemId))
+			deletable[ndeletable++] = offnum;
+	}
+
+	if (ndeletable > 0)
+	{
+		START_CRIT_SECTION();
+
+		PageIndexMultiDelete(page, deletable, ndeletable);
+
+		/*
+		 * Mark the page as not containing any LP_DEAD items.  This is not
+		 * certainly true (there might be some that have recently been marked,
+		 * but weren't included in our target-item list), but it will almost
+		 * always be true and it doesn't seem worth an additional page scan to
+		 * check it. Remember that F_HAS_GARBAGE is only a hint anyway.
+		 */
+		GistClearPageHasGarbage(page);
+
+		MarkBufferDirty(buffer);
+
+		/* XLOG stuff */
+		if (RelationNeedsWAL(rel))
+		{
+			XLogRecPtr	recptr;
+
+			recptr = gistXLogUpdate(rel->rd_node, buffer,
+									deletable, ndeletable,
+									NULL, 0, InvalidBuffer);
+
+			PageSetLSN(page, recptr);
+		}
+		else
+			PageSetLSN(page, gistGetFakeLSN(rel));
+
+		END_CRIT_SECTION();
+	}
+
+	/*
+	 * Note: if we didn't find any LP_DEAD items, then the page's
+	 * F_HAS_GARBAGE hint bit is falsely set.  We do not bother expending a
+	 * separate write to clear it, however.  We will clear it when we split
+	 * the page.
+	 */
+}
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index 20f695cee4..473ae430dd 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -24,6 +24,77 @@
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
+/*
+ * gistkillitems() -- set LP_DEAD state for items an indexscan caller has
+ * told us were killed.
+ *
+ * We re-read page here, so it's important to check page LSN. If the page
+ * has been modified since the last read (as determined by LSN), we cannot
+ * flag any entries because it is possible that the old entry was vacuumed
+ * away and the TID was re-used by a completely different heap tuple.
+ */
+static void
+gistkillitems(IndexScanDesc scan)
+{
+	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+	Buffer		buffer;
+	Page		page;
+	OffsetNumber offnum;
+	ItemId		iid;
+	int			i;
+	bool		killedsomething = false;
+
+	Assert(so->curBlkno != InvalidBlockNumber);
+	Assert(!XLogRecPtrIsInvalid(so->curPageLSN));
+	Assert(so->killedItems != NULL);
+
+	buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
+	if (!BufferIsValid(buffer))
+		return;
+
+	LockBuffer(buffer, GIST_SHARE);
+	gistcheckpage(scan->indexRelation, buffer);
+	page = BufferGetPage(buffer);
+
+	/*
+	 * If the page LSN differs, the page was modified since we last read it;
+	 * killedItems may then be stale, so applying LP_DEAD hints is not safe.
+	 */
+	if (PageGetLSN(page) != so->curPageLSN)
+	{
+		UnlockReleaseBuffer(buffer);
+		so->numKilled = 0;		/* reset counter */
+		return;
+	}
+
+	Assert(GistPageIsLeaf(page));
+
+	/*
+	 * Mark all killedItems as dead. We need no additional recheck,
+	 * because, if page was modified, pageLSN must have changed.
+	 */
+	for (i = 0; i < so->numKilled; i++)
+	{
+		offnum = so->killedItems[i];
+		iid = PageGetItemId(page, offnum);
+		ItemIdMarkDead(iid);
+		killedsomething = true;
+	}
+
+	if (killedsomething)
+	{
+		GistMarkPageHasGarbage(page);
+		MarkBufferDirtyHint(buffer, true);
+	}
+
+	UnlockReleaseBuffer(buffer);
+
+	/*
+	 * Always reset the scan state, so we don't look for same items on other
+	 * pages.
+	 */
+	so->numKilled = 0;
+}
 
 /*
  * gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
@@ -305,17 +376,33 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
 	if (so->pageDataCxt)
 		MemoryContextReset(so->pageDataCxt);
 
+	/*
+	 * We save the LSN of the page as we read it, so that we know whether it
+	 * is safe to apply LP_DEAD hints to the page later. This allows us to drop
+	 * the pin for MVCC scans, which allows vacuum to avoid blocking.
+	 */
+	so->curPageLSN = PageGetLSN(page);
+
 	/*
 	 * check all tuples on page
 	 */
 	maxoff = PageGetMaxOffsetNumber(page);
 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
-		IndexTuple	it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+		ItemId		iid = PageGetItemId(page, i);
+		IndexTuple	it;
 		bool		match;
 		bool		recheck;
 		bool		recheck_distances;
 
+		/*
+		 * If the scan specifies not to return killed tuples, then we treat a
+		 * killed tuple as not passing the qual.
+		 */
+		if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			continue;
+
+		it = (IndexTuple) PageGetItem(page, iid);
 		/*
 		 * Must call gistindex_keytest in tempCxt, and clean up any leftover
 		 * junk afterward.
@@ -348,6 +435,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
 			 */
 			so->pageData[so->nPageData].heapPtr = it->t_tid;
 			so->pageData[so->nPageData].recheck = recheck;
+			so->pageData[so->nPageData].offnum = i;
 
 			/*
 			 * In an index-only scan, also fetch the data from the tuple.
@@ -572,7 +660,24 @@ gistgettuple(PG_FUNCTION_ARGS)
 		{
 			if (so->curPageData < so->nPageData)
 			{
+				if (scan->kill_prior_tuple && so->curPageData > 0)
+				{
 
+					if (so->killedItems == NULL)
+					{
+						MemoryContext oldCxt =
+							MemoryContextSwitchTo(so->giststate->scanCxt);
+
+						so->killedItems =
+							(OffsetNumber *) palloc(MaxIndexTuplesPerPage
+													* sizeof(OffsetNumber));
+
+						MemoryContextSwitchTo(oldCxt);
+					}
+					if (so->numKilled < MaxIndexTuplesPerPage)
+						so->killedItems[so->numKilled++] =
+							so->pageData[so->curPageData - 1].offnum;
+				}
 				/* continuing to return tuples from a leaf page */
 				scan->xs_ctup.t_self = so->pageData[so->curPageData].heapPtr;
 				scan->xs_recheck = so->pageData[so->curPageData].recheck;
@@ -586,9 +691,38 @@ gistgettuple(PG_FUNCTION_ARGS)
 				PG_RETURN_BOOL(true);
 			}
 
+			/*
+			 * Check the last returned tuple and add it to killedItems if
+			 * necessary
+			 */
+			if (scan->kill_prior_tuple
+				&& so->curPageData > 0
+				&& so->curPageData == so->nPageData)
+			{
+
+				if (so->killedItems == NULL)
+				{
+					MemoryContext oldCxt =
+						MemoryContextSwitchTo(so->giststate->scanCxt);
+
+					so->killedItems =
+						(OffsetNumber *) palloc(MaxIndexTuplesPerPage
+												* sizeof(OffsetNumber));
+
+					MemoryContextSwitchTo(oldCxt);
+				}
+				if (so->numKilled < MaxIndexTuplesPerPage)
+					so->killedItems[so->numKilled++] =
+						so->pageData[so->curPageData - 1].offnum;
+			}
 			/* find and process the next index page */
 			do
 			{
-				GISTSearchItem *item = getNextGISTSearchItem(so);
+				GISTSearchItem *item;
+
+				if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0))
+					gistkillitems(scan);
+
+				item = getNextGISTSearchItem(so);
 
 				if (!item)
@@ -596,6 +730,9 @@ gistgettuple(PG_FUNCTION_ARGS)
 
 				CHECK_FOR_INTERRUPTS();
 
+				/* save current item BlockNumber for next gistkillitems() call */
+				so->curBlkno = item->blkno;
+
 				/*
 				 * While scanning a leaf page, ItemPointers of matching heap
 				 * tuples are stored in so->pageData.  If there are any on
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index ad39294875..a17c5bc564 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -93,6 +93,11 @@ gistbeginscan(PG_FUNCTION_ARGS)
 		memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys);
 	}
 
+	so->killedItems = NULL;		/* until needed */
+	so->numKilled = 0;
+	so->curBlkno = InvalidBlockNumber;
+	so->curPageLSN = InvalidXLogRecPtr;
+
 	scan->opaque = so;
 
 	/*
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 81e559bc2d..ea3a3b01f4 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -41,8 +41,11 @@
  */
 #define F_LEAF				(1 << 0)	/* leaf page */
 #define F_DELETED			(1 << 1)	/* the page has been deleted */
-#define F_TUPLES_DELETED	(1 << 2)	/* some tuples on the page are dead */
+#define F_TUPLES_DELETED	(1 << 2)	/* some tuples on the page were
+										 * deleted */
 #define F_FOLLOW_RIGHT		(1 << 3)	/* page to the right has no downlink */
+#define F_HAS_GARBAGE		(1 << 4)	/* some tuples on the page are dead,
+										 * but not deleted yet */
 
 typedef XLogRecPtr GistNSN;
 
@@ -137,6 +140,10 @@ typedef struct GISTENTRY
 #define GistMarkTuplesDeleted(page) ( GistPageGetOpaque(page)->flags |= F_TUPLES_DELETED)
 #define GistClearTuplesDeleted(page)	( GistPageGetOpaque(page)->flags &= ~F_TUPLES_DELETED)
 
+#define GistPageHasGarbage(page) ( GistPageGetOpaque(page)->flags & F_HAS_GARBAGE)
+#define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE)
+#define GistClearPageHasGarbage(page)	( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE)
+
 #define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT)
 #define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT)
 #define GistClearFollowRight(page)	( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT)
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 4f1a5c33ea..1a77982391 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -22,6 +22,7 @@
 #include "storage/bufmgr.h"
 #include "storage/buffile.h"
 #include "utils/hsearch.h"
+#include "access/genam.h"
 
 /*
  * Maximum number of "halves" a page can be split into in one operation.
@@ -121,9 +122,11 @@ typedef struct GISTSearchHeapItem
 {
 	ItemPointerData heapPtr;
 	bool		recheck;		/* T if quals must be rechecked */
 	bool		recheckDistances;	/* T if distances must be rechecked */
 	IndexTuple	ftup;			/* data fetched back from the index, used in
 								 * index-only scans */
+	OffsetNumber offnum;		/* track offset in page to mark tuple as
+								 * LP_DEAD */
 } GISTSearchHeapItem;
 
 /* Unvisited item, either index page or heap tuple */
@@ -161,6 +164,12 @@ typedef struct GISTScanOpaqueData
 	/* pre-allocated workspace arrays */
 	double	   *distances;		/* output area for gistindex_keytest */
 
+	/* info about killed items if any (killedItems is NULL if never used) */
+	OffsetNumber *killedItems;	/* offset numbers of killed items */
+	int			numKilled;		/* number of currently stored items */
+	BlockNumber curBlkno;		/* current number of block */
+	GistNSN		curPageLSN;		/* pos in the WAL stream when page was read */
+
 	/* In a non-ordered search, returnable heap items are stored here: */
 	GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
 	OffsetNumber nPageData;		/* number of valid items in array */