Fix an additional set of problems in GIN's handling of lossy page pointers.

Although the key-combining code claimed to work correctly if its input
contained both lossy and exact pointers for a single page in a single TID
stream, in fact this did not work, and could not work without pretty
fundamental redesign.  Modify keyGetItem so that it will not return such a
stream, by handling lossy-pointer cases a bit more explicitly than we did
before.

Per followup investigation of a gripe from Artur Dabrowski.
An example of a query that failed given his data set is
select count(*) from search_tab where
(to_tsvector('german', keywords ) @@ to_tsquery('german', 'ee:* | dd:*')) and
(to_tsvector('german', keywords ) @@ to_tsquery('german', 'aa:*'));

Back-patch to 8.4 where the lossy pointer code was introduced.
This commit is contained in:
Tom Lane 2010-08-01 19:16:39 +00:00
parent 0454f13161
commit d4fe61b083
2 changed files with 139 additions and 68 deletions

View File

@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California * Portions Copyright (c) 1994, Regents of the University of California
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.31 2010/07/31 00:30:54 tgl Exp $ * $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.32 2010/08/01 19:16:39 tgl Exp $
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
@ -49,7 +49,6 @@ findItemInPage(Page page, ItemPointer item, OffsetNumber *off)
/* /*
* scan page to find equal or first greater value * scan page to find equal or first greater value
*/ */
for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++) for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++)
{ {
res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off)); res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
@ -527,12 +526,40 @@ entryGetNextItem(Relation index, GinScanEntry entry)
} }
} }
/* convenience function for invoking a key's consistentFn */
static inline bool
callConsistentFn(GinState *ginstate, GinScanKey key)
{
/*
* Initialize recheckCurItem in case the consistentFn doesn't know it
* should set it. The safe assumption in that case is to force recheck.
*/
key->recheckCurItem = true;
return DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1],
PointerGetDatum(key->entryRes),
UInt16GetDatum(key->strategy),
key->query,
UInt32GetDatum(key->nentries),
PointerGetDatum(key->extra_data),
PointerGetDatum(&key->recheckCurItem)));
}
#define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE)) #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
#define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) ) #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
/* /*
* Sets entry->curItem to next heap item pointer for one entry of one scan key, * Sets entry->curItem to next heap item pointer for one entry of one scan key,
* or sets entry->isFinished to TRUE if there are no more. * or sets entry->isFinished to TRUE if there are no more.
*
* Item pointers must be returned in ascending order.
*
* Note: this can return a "lossy page" item pointer, indicating that the
* entry potentially matches all items on that heap page. However, it is
* not allowed to return both a lossy page pointer and exact (regular)
* item pointers for the same page. (Doing so would break the key-combination
* logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
* current implementation this is guaranteed by the behavior of tidbitmaps.
*/ */
static void static void
entryGetItem(Relation index, GinScanEntry entry) entryGetItem(Relation index, GinScanEntry entry)
@ -625,15 +652,20 @@ entryGetItem(Relation index, GinScanEntry entry)
* item pointer (including the case where the item pointer is a lossy page * item pointer (including the case where the item pointer is a lossy page
* pointer). * pointer).
* *
* Note: lossy page could be returned after single items from the same page. * Item pointers must be returned in ascending order.
* This is OK since the results will just be used to build a bitmap; we'll *
* set a bitmap entry more than once, but never actually return a row twice. * Note: this can return a "lossy page" item pointer, indicating that the
* key potentially matches all items on that heap page. However, it is
* not allowed to return both a lossy page pointer and exact (regular)
* item pointers for the same page. (Doing so would break the key-combination
* logic in scanGetItem.)
*/ */
static void static void
keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx,
GinScanKey key, ItemPointer advancePast) GinScanKey key, ItemPointer advancePast)
{ {
ItemPointerData myAdvancePast = *advancePast; ItemPointerData myAdvancePast = *advancePast;
ItemPointerData curPageLossy;
uint32 i; uint32 i;
uint32 lossyEntry; uint32 lossyEntry;
bool haveLossyEntry; bool haveLossyEntry;
@ -691,87 +723,112 @@ keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx,
myAdvancePast = key->curItem; myAdvancePast = key->curItem;
/* /*
* Prepare entryRes array to be passed to consistentFn.
*
* If key->nentries == 1 then the consistentFn should always succeed,
* but we must call it anyway to find out the recheck status.
*
* Lossy-page entries pose a problem, since we don't know the correct * Lossy-page entries pose a problem, since we don't know the correct
* entryRes state to pass to the consistentFn, and we also don't know * entryRes state to pass to the consistentFn, and we also don't know
* what its combining logic will be (could be AND, OR, or even NOT). * what its combining logic will be (could be AND, OR, or even NOT).
* Our strategy for a single lossy-page entry is to try the * If the logic is OR then the consistentFn might succeed for all
* consistentFn both ways and return a hit if it accepts either one * items in the lossy page even when none of the other entries match.
* (forcing the hit to be marked lossy so it will be rechecked). *
* If we have a single lossy-page entry then we check to see if the
* consistentFn will succeed with only that entry TRUE. If so,
* we return a lossy-page pointer to indicate that the whole heap
* page must be checked. (On the next call, we'll advance past all
* regular and lossy entries for this page before resuming search,
* thus ensuring that we never return both regular and lossy pointers
* for the same page.)
* *
* This idea could be generalized to more than one lossy-page entry, * This idea could be generalized to more than one lossy-page entry,
* but ideally lossy-page entries should be infrequent so it would * but ideally lossy-page entries should be infrequent so it would
* seldom be the case that we have more than one. If we do find more * seldom be the case that we have more than one at once. So it
* than one, we just punt and return the item as lossy. * doesn't seem worth the extra complexity to optimize that case.
* If we do find more than one, we just punt and return a lossy-page
* pointer always.
* *
* Note that only lossy-page entries pointing to the current item's * Note that only lossy-page entries pointing to the current item's
* page should trigger this processing. * page should trigger this processing; we might have future lossy
* pages in the entry array, but they aren't relevant yet.
*/ */
ItemPointerSetLossyPage(&curPageLossy,
GinItemPointerGetBlockNumber(&key->curItem));
lossyEntry = 0; lossyEntry = 0;
haveLossyEntry = false; haveLossyEntry = false;
for (i = 0; i < key->nentries; i++) for (i = 0; i < key->nentries; i++)
{ {
entry = key->scanEntry + i; entry = key->scanEntry + i;
if (entry->isFinished == FALSE &&
if (entry->isFinished) compareItemPointers(&entry->curItem, &curPageLossy) == 0)
key->entryRes[i] = FALSE;
else if (ItemPointerIsLossyPage(&entry->curItem) &&
GinItemPointerGetBlockNumber(&entry->curItem) ==
GinItemPointerGetBlockNumber(&key->curItem))
{ {
if (haveLossyEntry) if (haveLossyEntry)
{ {
/* Too many lossy entries, punt */ /* Multiple lossy entries, punt */
key->curItem = curPageLossy;
key->recheckCurItem = true; key->recheckCurItem = true;
return; return;
} }
lossyEntry = i; lossyEntry = i;
haveLossyEntry = true; haveLossyEntry = true;
/* initially assume TRUE */
key->entryRes[i] = TRUE;
} }
else if (compareItemPointers(&entry->curItem, &key->curItem) == 0) }
/* prepare for calling consistentFn in temp context */
oldCtx = MemoryContextSwitchTo(tempCtx);
if (haveLossyEntry)
{
/* Single lossy-page entry, so see if whole page matches */
memset(key->entryRes, FALSE, key->nentries);
key->entryRes[lossyEntry] = TRUE;
if (callConsistentFn(ginstate, key))
{
/* Yes, so clean up ... */
MemoryContextSwitchTo(oldCtx);
MemoryContextReset(tempCtx);
/* and return lossy pointer for whole page */
key->curItem = curPageLossy;
key->recheckCurItem = true;
return;
}
}
/*
* At this point we know that we don't need to return a lossy
* whole-page pointer, but we might have matches for individual exact
* item pointers, possibly in combination with a lossy pointer. Our
* strategy if there's a lossy pointer is to try the consistentFn both
* ways and return a hit if it accepts either one (forcing the hit to
* be marked lossy so it will be rechecked).
*
* Prepare entryRes array to be passed to consistentFn.
*
* (If key->nentries == 1 then the consistentFn should always succeed,
* but we must call it anyway to find out the recheck status.)
*/
for (i = 0; i < key->nentries; i++)
{
entry = key->scanEntry + i;
if (entry->isFinished == FALSE &&
compareItemPointers(&entry->curItem, &key->curItem) == 0)
key->entryRes[i] = TRUE; key->entryRes[i] = TRUE;
else else
key->entryRes[i] = FALSE; key->entryRes[i] = FALSE;
} }
if (haveLossyEntry)
key->entryRes[lossyEntry] = TRUE;
oldCtx = MemoryContextSwitchTo(tempCtx); res = callConsistentFn(ginstate, key);
/*
* Initialize recheckCurItem in case the consistentFn doesn't know it
* should set it. The safe assumption in that case is to force
* recheck.
*/
key->recheckCurItem = true;
res = DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1],
PointerGetDatum(key->entryRes),
UInt16GetDatum(key->strategy),
key->query,
UInt32GetDatum(key->nentries),
PointerGetDatum(key->extra_data),
PointerGetDatum(&key->recheckCurItem)));
if (!res && haveLossyEntry) if (!res && haveLossyEntry)
{ {
/* try the other way for the lossy item */ /* try the other way for the lossy item */
key->entryRes[lossyEntry] = FALSE; key->entryRes[lossyEntry] = FALSE;
key->recheckCurItem = true;
res = DatumGetBool(FunctionCall6(&ginstate->consistentFn[key->attnum - 1], res = callConsistentFn(ginstate, key);
PointerGetDatum(key->entryRes),
UInt16GetDatum(key->strategy),
key->query,
UInt32GetDatum(key->nentries),
PointerGetDatum(key->extra_data),
PointerGetDatum(&key->recheckCurItem)));
} }
/* clean up after consistentFn calls */
MemoryContextSwitchTo(oldCtx); MemoryContextSwitchTo(oldCtx);
MemoryContextReset(tempCtx); MemoryContextReset(tempCtx);
@ -1108,7 +1165,6 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
GinScanOpaque so = (GinScanOpaque) scan->opaque; GinScanOpaque so = (GinScanOpaque) scan->opaque;
MemoryContext oldCtx; MemoryContext oldCtx;
bool recheck, bool recheck,
keyrecheck,
match; match;
int i; int i;
pendingPosition pos; pendingPosition pos;
@ -1152,8 +1208,8 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
continue; continue;
/* /*
* Matching of entries of one row is finished, so check row by * Matching of entries of one row is finished, so check row using
* consistent function. * consistent functions.
*/ */
oldCtx = MemoryContextSwitchTo(so->tempCtx); oldCtx = MemoryContextSwitchTo(so->tempCtx);
recheck = false; recheck = false;
@ -1163,21 +1219,12 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
{ {
GinScanKey key = so->keys + i; GinScanKey key = so->keys + i;
keyrecheck = true; if (!callConsistentFn(&so->ginstate, key))
if (!DatumGetBool(FunctionCall6(&so->ginstate.consistentFn[key->attnum - 1],
PointerGetDatum(key->entryRes),
UInt16GetDatum(key->strategy),
key->query,
UInt32GetDatum(key->nentries),
PointerGetDatum(key->extra_data),
PointerGetDatum(&keyrecheck))))
{ {
match = false; match = false;
break; break;
} }
recheck |= key->recheckCurItem;
recheck |= keyrecheck;
} }
MemoryContextSwitchTo(oldCtx); MemoryContextSwitchTo(oldCtx);
@ -1248,11 +1295,26 @@ scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
Assert(!ItemPointerIsMax(item)); Assert(!ItemPointerIsMax(item));
/* /*----------
* Now *item contains first ItemPointer after previous result. * Now *item contains first ItemPointer after previous result.
* *
* The item is a valid hit only if all the keys returned either * The item is a valid hit only if all the keys returned either
* that exact TID, or a lossy reference to the same page. * that exact TID, or a lossy reference to the same page.
*
* This logic works only if a keyGetItem stream can never contain both
* exact and lossy pointers for the same page. Else we could have a
* case like
*
* stream 1 stream 2
* ... ...
* 42/6 42/7
* 50/1 42/0xffff
* ... ...
*
* We would conclude that 42/6 is not a match and advance stream 1,
* thus never detecting the match to the lossy pointer in stream 2.
* (keyGetItem has a similar problem versus entryGetItem.)
*----------
*/ */
match = true; match = true;
for (i = 0; i < so->nkeys; i++) for (i = 0; i < so->nkeys; i++)

View File

@ -4,7 +4,7 @@
* *
* Copyright (c) 2006-2010, PostgreSQL Global Development Group * Copyright (c) 2006-2010, PostgreSQL Global Development Group
* *
* $PostgreSQL: pgsql/src/include/access/gin.h,v 1.40 2010/08/01 02:12:42 tgl Exp $ * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.41 2010/08/01 19:16:39 tgl Exp $
*-------------------------------------------------------------------------- *--------------------------------------------------------------------------
*/ */
#ifndef GIN_H #ifndef GIN_H
@ -107,13 +107,22 @@ typedef struct GinMetaPageData
* We use our own ItemPointerGet(BlockNumber|GetOffsetNumber) * We use our own ItemPointerGet(BlockNumber|GetOffsetNumber)
* to avoid Asserts, since sometimes the ip_posid isn't "valid" * to avoid Asserts, since sometimes the ip_posid isn't "valid"
*/ */
#define GinItemPointerGetBlockNumber(pointer) \ #define GinItemPointerGetBlockNumber(pointer) \
BlockIdGetBlockNumber(&(pointer)->ip_blkid) BlockIdGetBlockNumber(&(pointer)->ip_blkid)
#define GinItemPointerGetOffsetNumber(pointer) \ #define GinItemPointerGetOffsetNumber(pointer) \
((pointer)->ip_posid) ((pointer)->ip_posid)
/*
* Special-case item pointer values needed by the GIN search logic.
* MIN: sorts less than any valid item pointer
* MAX: sorts greater than any valid item pointer
* LOSSY PAGE: indicates a whole heap page, sorts after normal item
* pointers for that page
* Note that these are all distinguishable from an "invalid" item pointer
* (which is InvalidBlockNumber/0) as well as from all normal item
* pointers (which have item numbers in the range 1..MaxHeapTuplesPerPage).
*/
#define ItemPointerSetMin(p) \ #define ItemPointerSetMin(p) \
ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0) ItemPointerSet((p), (BlockNumber)0, (OffsetNumber)0)
#define ItemPointerIsMin(p) \ #define ItemPointerIsMin(p) \