From d4fe61b08353c98b636255645a76709f3051df34 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 1 Aug 2010 19:16:39 +0000 Subject: [PATCH] Fix an additional set of problems in GIN's handling of lossy page pointers. Although the key-combining code claimed to work correctly if its input contained both lossy and exact pointers for a single page in a single TID stream, in fact this did not work, and could not work without pretty fundamental redesign. Modify keyGetItem so that it will not return such a stream, by handling lossy-pointer cases a bit more explicitly than we did before. Per followup investigation of a gripe from Artur Dabrowski. An example of a query that failed given his data set is select count(*) from search_tab where (to_tsvector('german', keywords ) @@ to_tsquery('german', 'ee:* | dd:*')) and (to_tsvector('german', keywords ) @@ to_tsquery('german', 'aa:*')); Back-patch to 8.4 where the lossy pointer code was introduced. --- src/backend/access/gin/ginget.c | 194 +++++++++++++++++++++----------- src/include/access/gin.h | 13 ++- 2 files changed, 139 insertions(+), 68 deletions(-) diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 7547bc3891..eb6f5d1eb0 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.31 2010/07/31 00:30:54 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.32 2010/08/01 19:16:39 tgl Exp $ *------------------------------------------------------------------------- */ @@ -43,13 +43,12 @@ findItemInPage(Page page, ItemPointer item, OffsetNumber *off) int res; if (GinPageGetOpaque(page)->flags & GIN_DELETED) - /* page was deleted by concurrent vacuum */ + /* page was deleted by concurrent vacuum */ return false; /* * scan page to find equal or first greater value */ - for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++) { res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off)); @@ -527,12 +526,40 @@ entryGetNextItem(Relation index, GinScanEntry entry) } } +/* convenience function for invoking a key's consistentFn */ +static inline bool +callConsistentFn(GinState *ginstate, GinScanKey key) +{ + /* + * Initialize recheckCurItem in case the consistentFn doesn't know it + * should set it. The safe assumption in that case is to force recheck. + */ + key->recheckCurItem = true; + + return DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1], + PointerGetDatum(key->entryRes), + UInt16GetDatum(key->strategy), + key->query, + UInt32GetDatum(key->nentries), + PointerGetDatum(key->extra_data), + PointerGetDatum(&key->recheckCurItem))); +} + #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) /* * Sets entry->curItem to next heap item pointer for one entry of one scan key, * or sets entry->isFinished to TRUE if there are no more. + * + * Item pointers must be returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * entry potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the + * current implementation this is guaranteed by the behavior of tidbitmaps. */ static void entryGetItem(Relation index, GinScanEntry entry) @@ -625,15 +652,20 @@ entryGetItem(Relation index, GinScanEntry entry) * item pointer (including the case where the item pointer is a lossy page * pointer). * - * Note: lossy page could be returned after single items from the same page. - * This is OK since the results will just be used to build a bitmap; we'll - * set a bitmap entry more than once, but never actually return a row twice. + * Item pointers must be returned in ascending order. + * + * Note: this can return a "lossy page" item pointer, indicating that the + * key potentially matches all items on that heap page. However, it is + * not allowed to return both a lossy page pointer and exact (regular) + * item pointers for the same page. (Doing so would break the key-combination + * logic in scanGetItem.) */ static void keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, GinScanKey key, ItemPointer advancePast) { ItemPointerData myAdvancePast = *advancePast; + ItemPointerData curPageLossy; uint32 i; uint32 lossyEntry; bool haveLossyEntry; @@ -691,87 +723,112 @@ keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, myAdvancePast = key->curItem; /* - * Prepare entryRes array to be passed to consistentFn. - * - * If key->nentries == 1 then the consistentFn should always succeed, - * but we must call it anyway to find out the recheck status. - * * Lossy-page entries pose a problem, since we don't know the correct * entryRes state to pass to the consistentFn, and we also don't know * what its combining logic will be (could be AND, OR, or even NOT). - * Our strategy for a single lossy-page entry is to try the - * consistentFn both ways and return a hit if it accepts either one - * (forcing the hit to be marked lossy so it will be rechecked). + * If the logic is OR then the consistentFn might succeed for all + * items in the lossy page even when none of the other entries match. + * + * If we have a single lossy-page entry then we check to see if the + * consistentFn will succeed with only that entry TRUE. If so, + * we return a lossy-page pointer to indicate that the whole heap + * page must be checked. (On the next call, we'll advance past all + * regular and lossy entries for this page before resuming search, + * thus ensuring that we never return both regular and lossy pointers + * for the same page.) * * This idea could be generalized to more than one lossy-page entry, * but ideally lossy-page entries should be infrequent so it would - * seldom be the case that we have more than one. If we do find more - * than one, we just punt and return the item as lossy. + * seldom be the case that we have more than one at once. So it + * doesn't seem worth the extra complexity to optimize that case. + * If we do find more than one, we just punt and return a lossy-page + * pointer always. * * Note that only lossy-page entries pointing to the current item's - * page should trigger this processing. + * page should trigger this processing; we might have future lossy + * pages in the entry array, but they aren't relevant yet. */ + ItemPointerSetLossyPage(&curPageLossy, + GinItemPointerGetBlockNumber(&key->curItem)); + lossyEntry = 0; haveLossyEntry = false; for (i = 0; i < key->nentries; i++) { entry = key->scanEntry + i; - - if (entry->isFinished) - key->entryRes[i] = FALSE; - else if (ItemPointerIsLossyPage(&entry->curItem) && - GinItemPointerGetBlockNumber(&entry->curItem) == - GinItemPointerGetBlockNumber(&key->curItem)) + if (entry->isFinished == FALSE && + compareItemPointers(&entry->curItem, &curPageLossy) == 0) { if (haveLossyEntry) { - /* Too many lossy entries, punt */ + /* Multiple lossy entries, punt */ + key->curItem = curPageLossy; key->recheckCurItem = true; return; } lossyEntry = i; haveLossyEntry = true; - /* initially assume TRUE */ - key->entryRes[i] = TRUE; } - else if (compareItemPointers(&entry->curItem, &key->curItem) == 0) + } + + /* prepare for calling consistentFn in temp context */ + oldCtx = MemoryContextSwitchTo(tempCtx); + + if (haveLossyEntry) + { + /* Single lossy-page entry, so see if whole page matches */ + memset(key->entryRes, FALSE, key->nentries); + key->entryRes[lossyEntry] = TRUE; + + if (callConsistentFn(ginstate, key)) + { + /* Yes, so clean up ... */ + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(tempCtx); + + /* and return lossy pointer for whole page */ + key->curItem = curPageLossy; + key->recheckCurItem = true; + return; + } + } + + /* + * At this point we know that we don't need to return a lossy + * whole-page pointer, but we might have matches for individual exact + * item pointers, possibly in combination with a lossy pointer. Our + * strategy if there's a lossy pointer is to try the consistentFn both + * ways and return a hit if it accepts either one (forcing the hit to + * be marked lossy so it will be rechecked). + * + * Prepare entryRes array to be passed to consistentFn. + * + * (If key->nentries == 1 then the consistentFn should always succeed, + * but we must call it anyway to find out the recheck status.) + */ + for (i = 0; i < key->nentries; i++) + { + entry = key->scanEntry + i; + if (entry->isFinished == FALSE && + compareItemPointers(&entry->curItem, &key->curItem) == 0) key->entryRes[i] = TRUE; else key->entryRes[i] = FALSE; } + if (haveLossyEntry) + key->entryRes[lossyEntry] = TRUE; - oldCtx = MemoryContextSwitchTo(tempCtx); - - /* - * Initialize recheckCurItem in case the consistentFn doesn't know it - * should set it. The safe assumption in that case is to force - * recheck. - */ - key->recheckCurItem = true; - - res = DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1], - PointerGetDatum(key->entryRes), - UInt16GetDatum(key->strategy), - key->query, - UInt32GetDatum(key->nentries), - PointerGetDatum(key->extra_data), - PointerGetDatum(&key->recheckCurItem))); + res = callConsistentFn(ginstate, key); if (!res && haveLossyEntry) { /* try the other way for the lossy item */ key->entryRes[lossyEntry] = FALSE; - key->recheckCurItem = true; - res = DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1], - PointerGetDatum(key->entryRes), - UInt16GetDatum(key->strategy), - key->query, - UInt32GetDatum(key->nentries), - PointerGetDatum(key->extra_data), - PointerGetDatum(&key->recheckCurItem))); + res = callConsistentFn(ginstate, key); } + /* clean up after consistentFn calls */ MemoryContextSwitchTo(oldCtx); MemoryContextReset(tempCtx); @@ -1108,7 +1165,6 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) GinScanOpaque so = (GinScanOpaque) scan->opaque; MemoryContext oldCtx; bool recheck, - keyrecheck, match; int i; pendingPosition pos; @@ -1152,8 +1208,8 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) continue; /* - * Matching of entries of one row is finished, so check row by - * consistent function. + * Matching of entries of one row is finished, so check row using + * consistent functions. */ oldCtx = MemoryContextSwitchTo(so->tempCtx); recheck = false; @@ -1163,21 +1219,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids) { GinScanKey key = so->keys + i; - keyrecheck = true; - - if (!DatumGetBool(FunctionCall6(&so->ginstate.consistentFn[key->attnum - 1], - PointerGetDatum(key->entryRes), - UInt16GetDatum(key->strategy), - key->query, - UInt32GetDatum(key->nentries), - PointerGetDatum(key->extra_data), - PointerGetDatum(&keyrecheck)))) + if (!callConsistentFn(&so->ginstate, key)) { match = false; break; } - - recheck |= keyrecheck; + recheck |= key->recheckCurItem; } MemoryContextSwitchTo(oldCtx); @@ -1248,11 +1295,26 @@ scanGetItem(IndexScanDesc scan, ItemPointer advancePast, Assert(!ItemPointerIsMax(item)); - /* + /*---------- * Now *item contains first ItemPointer after previous result. * * The item is a valid hit only if all the keys returned either * that exact TID, or a lossy reference to the same page. + * + * This logic works only if a keyGetItem stream can never contain both + * exact and lossy pointers for the same page. Else we could have a + * case like + * + * stream 1 stream 2 + * ... ... + * 42/6 42/7 + * 50/1 42/0xffff + * ... ... + * + * We would conclude that 42/6 is not a match and advance stream 1, + * thus never detecting the match to the lossy pointer in stream 2. + * (keyGetItem has a similar problem versus entryGetItem.) + *---------- */ match = true; for (i = 0; i < so->nkeys; i++) diff --git a/src/include/access/gin.h b/src/include/access/gin.h index d584573c5d..279ff13a01 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -4,7 +4,7 @@ * * Copyright (c) 2006-2010, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.40 2010/08/01 02:12:42 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.41 2010/08/01 19:16:39 tgl Exp $ *-------------------------------------------------------------------------- */ #ifndef GIN_H @@ -107,13 +107,22 @@ typedef struct GinMetaPageData * We use our own ItemPointerGet(BlockNumber|GetOffsetNumber) * to avoid Asserts, since sometimes the ip_posid isn't "valid" */ - #define GinItemPointerGetBlockNumber(pointer) \ BlockIdGetBlockNumber(&(pointer)->ip_blkid) #define GinItemPointerGetOffsetNumber(pointer) \ ((pointer)->ip_posid) +/* + * Special-case item pointer values needed by the GIN search logic. + * MIN: sorts less than any valid item pointer + * MAX: sorts greater than any valid item pointer + * LOSSY PAGE: indicates a whole heap page, sorts after normal item + * pointers for that page + * Note that these are all distinguishable from an "invalid" item pointer + * (which is InvalidBlockNumber/0) as well as from all normal item + * pointers (which have item numbers in the range 1..MaxHeapTuplesPerPage). + */ #define ItemPointerSetMin(p) \ ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0) #define ItemPointerIsMin(p) \