diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index b68daa55ae..76ac0fcddd 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -809,7 +809,8 @@ amrestrpos (IndexScanDesc scan); Size -amestimateparallelscan (void); +amestimateparallelscan (int nkeys, + int norderbys); Estimate and return the number of bytes of dynamic shared memory which the access method will be needed to perform a parallel scan. (This number @@ -817,6 +818,13 @@ amestimateparallelscan (void); AM-independent data in ParallelIndexScanDescData.) + + The nkeys and norderbys + parameters indicate the number of quals and ordering operators that will be + used in the scan; the same values will be passed to amrescan. + Note that the actual values of the scan keys aren't provided yet. + + It is not necessary to implement this function for access methods which do not support parallel scans or for which the number of additional bytes diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index e1e96ba7c4..053da8d6e4 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4064,6 +4064,19 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + Queries that use certain SQL constructs to search for + rows matching any value out of a list or array of multiple scalar values + (see ) perform multiple + primitive index scans (up to one primitive scan per scalar + value) during query execution. Each internal primitive index scan + increments pg_stat_all_indexes.idx_scan, + so it's possible for the count of index scans to significantly exceed the + total number of index scan executor node executions. + + + diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 78ac3b1abb..7510159fc8 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -449,13 +449,10 @@ index_restrpos(IndexScanDesc scan) /* * index_parallelscan_estimate - estimate shared memory for parallel scan - * - * Currently, we don't pass any information to the AM-specific estimator, - * so it can probably only return a constant. In the future, we might need - * to pass more information. */ Size -index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot) +index_parallelscan_estimate(Relation indexRelation, int nkeys, int norderbys, + Snapshot snapshot) { Size nbytes; @@ -474,7 +471,8 @@ index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot) */ if (indexRelation->rd_indam->amestimateparallelscan != NULL) nbytes = add_size(nbytes, - indexRelation->rd_indam->amestimateparallelscan()); + indexRelation->rd_indam->amestimateparallelscan(nkeys, + norderbys)); return nbytes; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 41df1027d2..686a3206f7 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -40,6 +40,9 @@ /* * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. * + * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the + * scan to advance it via another call to _bt_first. + * * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to * a new page; others must wait. * @@ -47,11 +50,11 @@ * to a new page; some process can start doing that. * * BTPARALLEL_DONE indicates that the scan is complete (including error exit). - * We reach this state once for every distinct combination of array keys. 
*/ typedef enum { BTPARALLEL_NOT_INITIALIZED, + BTPARALLEL_NEED_PRIMSCAN, BTPARALLEL_ADVANCING, BTPARALLEL_IDLE, BTPARALLEL_DONE, @@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. */ - int btps_arrayKeyCount; /* count indicating number of array scan - * keys processed by parallel scan */ - slock_t btps_mutex; /* protects above variables */ + slock_t btps_mutex; /* protects above variables, btps_arrElems */ ConditionVariable btps_cv; /* used to synchronize parallel scan */ + + /* + * btps_arrElems is used when scans need to schedule another primitive + * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys. + */ + int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]; } BTParallelScanDescData; typedef struct BTParallelScanDescData *BTParallelScanDesc; @@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* btree indexes are never lossy */ scan->xs_recheck = false; - /* - * If we have any array keys, initialize them during first call for a - * scan. We can't do this in btrescan because we don't know the scan - * direction at that time. - */ - if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) - { - /* punt if we have any unsatisfiable array keys */ - if (so->numArrayKeys < 0) - return false; - - _bt_start_array_keys(scan, dir); - } - - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* @@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* If we have a tuple, return it ... */ if (res) break; - /* ... otherwise see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + /* ... otherwise see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir)); return res; } @@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) int64 ntids = 0; ItemPointer heapTid; - /* - * If we have any array keys, initialize them. 
- */ - if (so->numArrayKeys) - { - /* punt if we have any unsatisfiable array keys */ - if (so->numArrayKeys < 0) - return ntids; - - _bt_start_array_keys(scan, ForwardScanDirection); - } - - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* Fetch the first page & tuple */ @@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) ntids++; } } - /* Now see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); + /* Now see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection)); return ntids; } @@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) else so->keyData = NULL; - so->arrayKeyData = NULL; /* assume no array keys for now */ - so->arraysStarted = false; - so->numArrayKeys = 0; + so->needPrimScan = false; + so->scanBehind = false; so->arrayKeys = NULL; + so->orderProcs = NULL; so->arrayContext = NULL; so->killedItems = NULL; /* until needed */ @@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, } so->markItemIndex = -1; - so->arrayKeyCount = 0; + so->needPrimScan = false; + so->scanBehind = false; BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); @@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ - - /* If any keys are SK_SEARCHARRAY type, set up array-key info */ - _bt_preprocess_array_keys(scan); + so->numArrayKeys = 0; /* ditto */ } /* @@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan) /* Release storage */ if (so->keyData != NULL) pfree(so->keyData); - /* so->arrayKeyData and so->arrayKeys are in arrayContext */ + /* so->arrayKeys and so->orderProcs are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); if (so->killedItems != NULL) @@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan) BTScanPosInvalidate(so->markPos); so->markItemIndex = -1; } - - /* Also record the current positions of any array keys */ - if (so->numArrayKeys) - _bt_mark_array_keys(scan); } /* @@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* Restore the marked positions of any array keys */ - if (so->numArrayKeys) - _bt_restore_array_keys(scan); - if (so->markItemIndex >= 0) { /* @@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan) if (so->currTuples) memcpy(so->currTuples, so->markTuples, so->markPos.nextTupleOffset); + /* Reset the scan's array keys (see _bt_steppage for why) */ + if (so->numArrayKeys) + { + _bt_start_array_keys(scan, so->currPos.dir); + so->needPrimScan = false; + } } else BTScanPosInvalidate(so->currPos); @@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan) * btestimateparallelscan -- estimate storage for BTParallelScanDescData */ Size -btestimateparallelscan(void) +btestimateparallelscan(int nkeys, int norderbys) { - return sizeof(BTParallelScanDescData); + /* Pessimistically assume all input scankeys will be output with arrays */ + return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys; } /* @@ -572,7 +551,6 @@ btinitparallelscan(void *target) SpinLockInit(&bt_target->btps_mutex); bt_target->btps_scanPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - bt_target->btps_arrayKeyCount = 0; 
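For illustration only (this aside is not part of the patch): the new btestimateparallelscan sizes the shared descriptor with offsetof() so that the trailing flexible array gets one slot per scan key. A minimal standalone C sketch of the same idiom, using hypothetical stand-in names rather than the real nbtree declarations:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* hypothetical stand-in for BTParallelScanDescData */
    typedef struct DemoParallelScan
    {
        int     pageStatus;     /* stands in for btps_pageStatus */
        int     arrElems[];     /* flexible array member: one slot per key */
    } DemoParallelScan;

    /* stands in for btestimateparallelscan: header plus nkeys int slots */
    static size_t
    demo_estimate(int nkeys)
    {
        return offsetof(DemoParallelScan, arrElems) + sizeof(int) * nkeys;
    }

    int
    main(void)
    {
        int     nkeys = 3;
        DemoParallelScan *scan = malloc(demo_estimate(nkeys));

        if (scan == NULL)
            return 1;
        scan->pageStatus = 0;
        for (int i = 0; i < nkeys; i++)
            scan->arrElems[i] = 0;  /* like the serialized cur_elem offsets */
        printf("%d keys need %zu bytes\n", nkeys, demo_estimate(nkeys));
        free(scan);
        return 0;
    }

Sizing with offsetof() rather than sizeof() keeps the per-key slots contiguous with the header, so the whole descriptor can live in a single chunk of dynamic shared memory.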
ConditionVariableInit(&bt_target->btps_cv); } @@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan) SpinLockAcquire(&btscan->btps_mutex); btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount = 0; SpinLockRelease(&btscan->btps_mutex); } @@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan) * or _bt_parallel_done(). * * The return value is true if we successfully seized the scan and false - * if we did not. The latter case occurs if no pages remain for the current - * set of scankeys. + * if we did not. The latter case occurs if no pages remain. * * If the return value is true, *pageno returns the next or current page * of the scan (depending on the scan direction). An invalid block number - * means the scan hasn't yet started, and P_NONE means we've reached the end. + * means the scan hasn't yet started, or that caller needs to start the next + * primitive index scan (if it's the latter case we'll set so.needPrimScan). * The first time a participating process reaches the last page, it will return * true and set *pageno to P_NONE; after that, further attempts to seize the * scan will return false. * * Callers should ignore the value of pageno if the return value is false. + * + * Callers that are in a position to start a new primitive index scan must + * pass first=true (all other callers pass first=false). We just return false + * for first=false callers that require another primitive index scan. */ bool -_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - BTPS_State pageStatus; bool exit_loop = false; bool status = true; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) *pageno = P_NONE; + if (first) + { + /* + * Initialize array related state when called from _bt_first, assuming + * that this will either be the first primitive index scan for the + * scan, or a previous explicitly scheduled primitive scan. + * + * Note: so->needPrimScan is only set when a scheduled primitive index + * scan is set to be performed in caller's worker process. It should + * not be set here by us for the first primitive scan, nor should we + * ever set it for a parallel scan that has no array keys. + */ + so->needPrimScan = false; + so->scanBehind = false; + } + else + { + /* + * Don't attempt to seize the scan when backend requires another + * primitive index scan unless we're in a position to start it now + */ + if (so->needPrimScan) + return false; + } + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); while (1) { SpinLockAcquire(&btscan->btps_mutex); - pageStatus = btscan->btps_pageStatus; - if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + if (btscan->btps_pageStatus == BTPARALLEL_DONE) { - /* Parallel scan has already advanced to a new set of scankeys. */ + /* We're done with this parallel index scan */ status = false; } - else if (pageStatus == BTPARALLEL_DONE) + else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) { + Assert(so->numArrayKeys); + /* - * We're done with this set of scankeys. This may be the end, or - * there could be more sets to try. + * If we can start another primitive scan right away, do so. + * Otherwise just wait. 
*/ - status = false; + if (first) + { + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + + array->cur_elem = btscan->btps_arrElems[i]; + skey->sk_argument = array->elem_values[array->cur_elem]; + } + so->needPrimScan = true; + so->scanBehind = false; + *pageno = InvalidBlockNumber; + exit_loop = true; + } } - else if (pageStatus != BTPARALLEL_ADVANCING) + else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING) { /* * We have successfully seized control of the scan for the purpose @@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) * _bt_parallel_release() -- Complete the process of advancing the scan to a * new page. We now have the new value btps_scanPage; some other backend * can now begin advancing the scan. + * + * Callers whose scan uses array keys must save their scan_page argument so + * that it can be passed to _bt_parallel_primscan_schedule, should caller + * determine that another primitive index scan is required. If that happens, + * scan_page won't be scanned by any backend (unless the next primitive index + * scan lands on scan_page). */ void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) @@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) void _bt_parallel_done(IndexScanDesc scan) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; bool status_changed = false; @@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan) parallel_scan->ps_offset); /* - * Mark the parallel scan as done for this combination of scan keys, - * unless some other process already did so. See also - * _bt_advance_array_keys. + * Mark the parallel scan as done, unless some other process did so + * already */ SpinLockAcquire(&btscan->btps_mutex); - if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && - btscan->btps_pageStatus != BTPARALLEL_DONE) + if (btscan->btps_pageStatus != BTPARALLEL_DONE) { btscan->btps_pageStatus = BTPARALLEL_DONE; status_changed = true; @@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan) } /* - * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array - * keys. + * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan. * - * Updates the count of array keys processed for both local and parallel - * scans. + * Caller passes the block number most recently passed to _bt_parallel_release + * by its backend. Caller successfully schedules the next primitive index scan + * if the shared parallel state hasn't been seized since caller's backend last + * advanced the scan. 
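To make the handoff described above concrete, here is a hedged standalone sketch of the schedule/seize protocol. A pthread mutex stands in for the btps_mutex spinlock, and every name (DemoShared, demo_schedule, demo_seize_first) is invented for illustration; this models the idea, not the patch's actual code:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    typedef enum { NOT_INIT, NEED_PRIMSCAN, ADVANCING, IDLE, DONE } DemoState;

    typedef struct DemoShared
    {
        pthread_mutex_t lock;       /* stands in for btps_mutex */
        unsigned    scanPage;       /* stands in for btps_scanPage */
        DemoState   pageStatus;     /* stands in for btps_pageStatus */
        int         arrElems[4];    /* serialized array key positions */
    } DemoShared;

    /*
     * Like _bt_parallel_primscan_schedule: only schedule when no backend
     * has seized the scan since prev_scan_page was released.
     */
    static void
    demo_schedule(DemoShared *shared, unsigned prev_scan_page,
                  const int *cur_elems, int nkeys)
    {
        pthread_mutex_lock(&shared->lock);
        if (shared->scanPage == prev_scan_page &&
            shared->pageStatus == IDLE)
        {
            shared->scanPage = 0;   /* like InvalidBlockNumber */
            shared->pageStatus = NEED_PRIMSCAN;
            for (int i = 0; i < nkeys; i++)
                shared->arrElems[i] = cur_elems[i]; /* serialize cur_elem */
        }
        pthread_mutex_unlock(&shared->lock);
    }

    /* Like the first=true path of _bt_parallel_seize, heavily simplified */
    static bool
    demo_seize_first(DemoShared *shared, int *cur_elems, int nkeys)
    {
        bool    seized = false;

        pthread_mutex_lock(&shared->lock);
        if (shared->pageStatus == NEED_PRIMSCAN)
        {
            shared->pageStatus = ADVANCING;
            for (int i = 0; i < nkeys; i++)
                cur_elems[i] = shared->arrElems[i]; /* restore cur_elem */
            seized = true;
        }
        pthread_mutex_unlock(&shared->lock);
        return seized;
    }

    int
    main(void)
    {
        DemoShared  shared = {PTHREAD_MUTEX_INITIALIZER, 42, IDLE, {0}};
        int         elems[4] = {7, 0, 0, 0};
        int         restored[4];

        demo_schedule(&shared, 42, elems, 4);
        if (demo_seize_first(&shared, restored, 4))
            printf("next primitive scan resumes at array element %d\n",
                   restored[0]);
        return 0;
    }

The scanPage == prev_scan_page test is the point of the protocol: if any backend seized the scan after prev_scan_page was released, the stored page no longer matches and the request is dropped, exactly as described above.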
*/ void -_bt_parallel_advance_array_keys(IndexScanDesc scan) +_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; + Assert(so->numArrayKeys); + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); - so->arrayKeyCount++; SpinLockAcquire(&btscan->btps_mutex); - if (btscan->btps_pageStatus == BTPARALLEL_DONE) + if (btscan->btps_scanPage == prev_scan_page && + btscan->btps_pageStatus == BTPARALLEL_IDLE) { btscan->btps_scanPage = InvalidBlockNumber; - btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount++; + btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; + + /* Serialize scan's current array keys */ + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + + btscan->btps_arrElems[i] = array->cur_elem; + } } SpinLockRelease(&btscan->btps_mutex); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index e3fff90d8e..d241e8ea1d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -907,7 +907,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (!so->qual_ok) { - /* Notify any other workers that we're done with this scan key. */ _bt_parallel_done(scan); return false; } @@ -917,10 +916,22 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * scan has not started, proceed to find out first leaf page in the usual * way while keeping other participating processes waiting. If the scan * has already begun, use the page number from the shared structure. + * + * When a parallel scan has another primitive index scan scheduled, a + * parallel worker will seize the scan for that purpose now. This is + * similar to the case where the top-level scan hasn't started. */ if (scan->parallel_scan != NULL) { - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, true); + + /* + * Initialize arrays (when _bt_parallel_seize didn't already set up + * the next primitive index scan) + */ + if (so->numArrayKeys && !so->needPrimScan) + _bt_start_array_keys(scan, dir); + if (!status) return false; else if (blkno == P_NONE) @@ -935,6 +946,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) goto readcomplete; } } + else if (so->numArrayKeys && !so->needPrimScan) + { + /* + * First _bt_first call (for current btrescan) without parallelism. + * + * Initialize arrays, and the corresponding scan keys that were just + * output by _bt_preprocess_keys. + */ + _bt_start_array_keys(scan, dir); + } /*---------- * Examine the scan keys to discover where we need to start the scan. @@ -980,6 +1001,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * * The selected scan keys (at most one per index column) are remembered by * storing their addresses into the local startKeys[] array. + * + * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start + * the next primitive index scan (for scans with array keys) based in part + * on an understanding of how it'll enable us to reposition the scan. + * They're directly aware of how we'll sometimes cons up an explicit + * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a + * symmetric "deduce NOT NULL" rule of their own. 
This allows top-level + * scans to skip large groups of NULLs through repeated deductions about + * key strictness (for a required inequality key) and whether NULLs in the + * key's index column are stored last or first (relative to non-NULLs). + * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might + * need to be kept in sync. *---------- */ strat_total = BTEqualStrategyNumber; @@ -1502,7 +1535,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * We scan the current page starting at offnum and moving in the indicated * direction. All items matching the scan keys are loaded into currPos.items. * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports - * that there can be no more matching tuples in the current scan direction. + * that there can be no more matching tuples in the current scan direction + * (could just be for the current primitive index scan when scan has arrays). * * _bt_first caller passes us an offnum returned by _bt_binsrch, which might * be an out of bounds offnum such as "maxoff + 1" in certain corner cases. @@ -1527,11 +1561,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int itemIndex; - bool continuescan; - int indnatts; - bool continuescanPrechecked; - bool haveFirstMatch = false; + BTReadPageState pstate; + bool arrayKeys; + int itemIndex, + indnatts; /* * We must have the buffer pinned and locked, but the usual macro can't be @@ -1546,16 +1579,32 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (scan->parallel_scan) { if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, opaque->btpo_next); + pstate.prev_scan_page = opaque->btpo_next; else - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf); + + _bt_parallel_release(scan, pstate.prev_scan_page); } - continuescan = true; /* default assumption */ indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + arrayKeys = so->numArrayKeys != 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); + /* initialize page-level state that we'll pass to _bt_checkkeys */ + pstate.dir = dir; + pstate.minoff = minoff; + pstate.maxoff = maxoff; + pstate.finaltup = NULL; + pstate.page = page; + pstate.offnum = InvalidOffsetNumber; + pstate.skip = InvalidOffsetNumber; + pstate.continuescan = true; /* default assumption */ + pstate.prechecked = false; + pstate.firstmatch = false; + pstate.rechecks = 0; + pstate.targetdistance = 0; + /* * We note the buffer's block number so that we can release the pin later. * This allows us to re-read the buffer if it is needed again for hinting. @@ -1598,10 +1647,34 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * corresponding value from the last item on the page. So checking with * the last item on the page would give a more precise answer. * - * We skip this for the first page in the scan to evade the possible - * slowdown of the point queries. + * We skip this for the first page read by each (primitive) scan, to avoid + * slowing down point queries. They typically don't stand to gain much + * when the optimization can be applied, and are more likely to notice the + * overhead of the precheck. 
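The intent of this precheck is easiest to see in miniature. In the hedged standalone sketch below, sorted integers stand in for a page's index tuples and a single upper bound stands in for the required scan keys; none of these names come from the patch. Checking only the page's final item lets every earlier item skip its own key check:

    #include <stdbool.h>
    #include <stdio.h>

    /* toy "required scan key": an item must be <= bound to match */
    static bool
    key_passes(int item, int bound)
    {
        return item <= bound;
    }

    /* count matches on one sorted "page", using a final-item precheck */
    static int
    count_matches(const int *page, int nitems, int bound)
    {
        bool    prechecked = nitems > 0 && key_passes(page[nitems - 1], bound);
        int     nmatches = 0;

        for (int i = 0; i < nitems; i++)
        {
            /* if the final (largest) item passed, earlier items must too */
            if (prechecked || key_passes(page[i], bound))
                nmatches++;
            else
                break;          /* sorted order: no later item can pass */
        }
        return nmatches;
    }

    int
    main(void)
    {
        int     page[] = {10, 20, 30, 40, 50};

        printf("%d\n", count_matches(page, 5, 60)); /* precheck fires: 5 */
        printf("%d\n", count_matches(page, 5, 35)); /* precheck fails: 3 */
        return 0;
    }

The sketch assumes every item on the page is judged against the same keys, which is precisely the assumption that array key advancement can violate, as the next paragraph explains.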
+ * + * The optimization is unsafe and must be avoided whenever _bt_checkkeys + * just set a low-order required array's key to the best available match + * for a truncated -inf attribute value from the prior page's high key + * (array element 0 is always the best available match in this scenario). + * It's quite likely that matches for array element 0 begin on this page, + * but the start of matches won't necessarily align with page boundaries. + * When the start of matches is somewhere in the middle of this page, it + * would be wrong to treat page's final non-pivot tuple as representative. + * Doing so might lead us to treat some of the page's earlier tuples as + * being part of a group of tuples thought to satisfy the required keys. + * + * Note: Conversely, in the case where the scan's arrays just advanced + * using the prior page's HIKEY _without_ advancement setting scanBehind, + * the start of matches must be aligned with page boundaries, which makes + * it safe to attempt the optimization here now. It's also safe when the + * prior page's HIKEY simply didn't need to advance any required array. In + * both cases we can safely assume that the _first_ tuple from this page + * must be >= the current set of array keys/equality constraints. And so + * if the final tuple is == those same keys (and also satisfies any + * required < or <= strategy scan keys) during the precheck, we can safely + * assume that this must also be true of all earlier tuples from the page. */ - if (!firstPage && minoff < maxoff) + if (!firstPage && !so->scanBehind && minoff < maxoff) { ItemId iid; IndexTuple itup; @@ -1609,22 +1682,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff); itup = (IndexTuple) PageGetItem(page, iid); - /* - * Do the precheck. Note that we pass the pointer to the - * 'continuescanPrechecked' to the 'continuescan' argument. That will - * set flag to true if all required keys are satisfied and false - * otherwise. - */ - (void) _bt_checkkeys(scan, itup, indnatts, dir, - &continuescanPrechecked, false, false); - } - else - { - continuescanPrechecked = false; + /* Call with arrayKeys=false to avoid undesirable side-effects */ + _bt_checkkeys(scan, &pstate, false, itup, indnatts); + pstate.prechecked = pstate.continuescan; + pstate.continuescan = true; /* reset */ } if (ScanDirectionIsForward(dir)) { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (arrayKeys && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in ascending order */ itemIndex = 0; @@ -1649,23 +1722,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, - continuescanPrechecked, - haveFirstMatch); + pstate.offnum = offnum; + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); /* - * If the result of prechecking required keys was true, then in - * assert-enabled builds we also recheck that the _bt_checkkeys() - * result is the same. 
+ * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) */ - Assert((!continuescanPrechecked && haveFirstMatch) || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false, false)); + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum < pstate.skip); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + if (passes_quals) { /* tuple passes all scan key conditions */ - haveFirstMatch = true; + pstate.firstmatch = true; if (!BTreeTupleIsPosting(itup)) { /* Remember it */ @@ -1696,7 +1774,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } } /* When !continuescan, there can't be any more matches, so stop */ - if (!continuescan) + if (!pstate.continuescan) break; offnum = OffsetNumberNext(offnum); @@ -1713,17 +1791,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * only appear on non-pivot tuples on the right sibling page are * common. */ - if (continuescan && !P_RIGHTMOST(opaque)) + if (pstate.continuescan && !P_RIGHTMOST(opaque)) { ItemId iid = PageGetItemId(page, P_HIKEY); IndexTuple itup = (IndexTuple) PageGetItem(page, iid); int truncatt; truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); - _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false); + pstate.prechecked = false; /* precheck didn't cover HIKEY */ + _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } - if (!continuescan) + if (!pstate.continuescan) so->currPos.moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); @@ -1733,6 +1812,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } else { + /* SK_SEARCHARRAY backward scans must provide final tuple up front */ + if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, minoff); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in descending order */ itemIndex = MaxTIDsPerBTreePage; @@ -1772,23 +1859,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, - continuescanPrechecked, - haveFirstMatch); + pstate.offnum = offnum; + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); /* - * If the result of prechecking required keys was true, then in - * assert-enabled builds we also recheck that the _bt_checkkeys() - * result is the same. 
+ * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) */ - Assert((!continuescanPrechecked && !haveFirstMatch) || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false, false)); + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum > pstate.skip); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions */ - haveFirstMatch = true; + pstate.firstmatch = true; if (!BTreeTupleIsPosting(itup)) { /* Remember it */ @@ -1824,7 +1916,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } } } - if (!continuescan) + if (!pstate.continuescan) { /* there can't be any more matches, so stop */ so->currPos.moreLeft = false; @@ -1970,6 +2062,31 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->currPos.nextTupleOffset); so->markPos.itemIndex = so->markItemIndex; so->markItemIndex = -1; + + /* + * If we're just about to start the next primitive index scan + * (possible with a scan that has arrays keys, and needs to skip to + * continue in the current scan direction), moreLeft/moreRight only + * indicate the end of the current primitive index scan. They must + * never be taken to indicate that the top-level index scan has ended + * (that would be wrong). + * + * We could handle this case by treating the current array keys as + * markPos state. But depending on the current array state like this + * would add complexity. Instead, we just unset markPos's copy of + * moreRight or moreLeft (whichever might be affected), while making + * btrestpos reset the scan's arrays to their initial scan positions. + * In effect, btrestpos leaves advancing the arrays up to the first + * _bt_readpage call (that takes place after it has restored markPos). + */ + Assert(so->markPos.dir == dir); + if (so->needPrimScan) + { + if (ScanDirectionIsForward(dir)) + so->markPos.moreRight = true; + else + so->markPos.moreLeft = true; + } } if (ScanDirectionIsForward(dir)) @@ -1981,7 +2098,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * Seize the scan to get the next block number; if the scan has * ended already, bail out. */ - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, false); if (!status) { /* release the previous buffer, if pinned */ @@ -2013,7 +2130,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * Seize the scan to get the current block number; if the scan has * ended already, bail out. 
*/ - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, false); BTScanPosUnpinIfPinned(so->currPos); if (!status) { @@ -2097,7 +2214,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) if (scan->parallel_scan != NULL) { _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, false); if (!status) { BTScanPosInvalidate(so->currPos); @@ -2193,7 +2310,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) if (scan->parallel_scan != NULL) { _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, false); if (!status) { BTScanPosInvalidate(so->currPos); @@ -2218,6 +2335,8 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + Assert(!so->needPrimScan); + _bt_initialize_more_data(so, dir); if (!_bt_readnextpage(scan, blkno, dir)) @@ -2524,14 +2643,22 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) } /* - * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately - * for scan direction + * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir + * from currPos */ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) { - /* initialize moreLeft/moreRight appropriately for scan direction */ - if (ScanDirectionIsForward(dir)) + so->currPos.dir = dir; + if (so->needPrimScan) + { + Assert(so->numArrayKeys); + + so->currPos.moreLeft = true; + so->currPos.moreRight = true; + so->needPrimScan = false; + } + else if (ScanDirectionIsForward(dir)) { so->currPos.moreLeft = false; so->currPos.moreRight = true; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index d50317096d..e963de78a7 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -29,29 +29,77 @@ #include "utils/memutils.h" #include "utils/rel.h" +#define LOOK_AHEAD_REQUIRED_RECHECKS 3 +#define LOOK_AHEAD_DEFAULT_DISTANCE 5 typedef struct BTSortArrayContext { - FmgrInfo flinfo; + FmgrInfo *sortproc; Oid collation; bool reverse; } BTSortArrayContext; +typedef struct BTScanKeyPreproc +{ + ScanKey skey; + int ikey; + int arrayidx; +} BTScanKeyPreproc; + +static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype, + FmgrInfo *orderproc, FmgrInfo **sortprocp); static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, - StrategyNumber strat, + Oid elemtype, StrategyNumber strat, Datum *elems, int nelems); -static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, - bool reverse, - Datum *elems, int nelems); +static int _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, + bool reverse, Datum *elems, int nelems); +static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey, + FmgrInfo *sortproc, bool reverse, + Oid origelemtype, Oid nextelemtype, + Datum *elems_orig, int *nelems_orig, + Datum *elems_next, int nelems_next); +static bool _bt_compare_array_scankey_args(IndexScanDesc scan, + ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, + bool *qual_ok); +static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan); +static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); static int _bt_compare_array_elements(const void *a, const void *b, void *arg); +static inline int32 _bt_compare_array_skey(FmgrInfo 
*orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur); +static int _bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); +static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); +static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, TupleDesc tupdesc, int tupnatts, + bool readpagetup, int sktrig, bool *scanBehind); +static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int sktrig, bool sktrig_required); +#ifdef USE_ASSERT_CHECKING +static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir); +static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); +#endif static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, + BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result); static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); static void _bt_mark_scankey_required(ScanKey skey); +static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool advancenonrequired, bool prechecked, bool firstmatch, + bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan); +static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, + int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); @@ -188,29 +236,55 @@ _bt_freestack(BTStack stack) * * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and * set up BTArrayKeyInfo info for each one that is an equality-type key. - * Prepare modified scan keys in so->arrayKeyData, which will hold the current - * array elements during each primitive indexscan operation. For inequality - * array keys, it's sufficient to find the extreme element value and replace - * the whole array with that scalar value. + * Returns modified scan keys as input for further, standard preprocessing. * - * Note: the reason we need so->arrayKeyData, rather than just scribbling - * on scan->keyData, is that callers are permitted to call btrescan without - * supplying a new set of scankey data. + * Currently we perform two kinds of preprocessing to deal with redundancies. + * For inequality array keys, it's sufficient to find the extreme element + * value and replace the whole array with that scalar value. This eliminates + * all but one array element as redundant. Similarly, we are capable of + * "merging together" multiple equality array keys (from two or more input + * scan keys) into a single output scan key containing only the intersecting + * array elements. This can eliminate many redundant array elements, as well + * as eliminating whole array scan keys as redundant. It can also allow us to + * detect contradictory quals. + * + * It is convenient for _bt_preprocess_keys caller to have to deal with no + * more than one equality strategy array scan key per index attribute. We'll + * always be able to set things up that way when complete opfamilies are used. 
+ * Eliminated array scan keys can be recognized as those that have had their + * sk_strategy field set to InvalidStrategy here by us. Caller should avoid + * including these in the scan's so->keyData[] output array. + * + * We set the scan key references from the scan's BTArrayKeyInfo info array to + * offsets into the temp modified input array returned to caller. Scans that + * have array keys should call _bt_preprocess_array_keys_final when standard + * preprocessing steps are complete. This will convert the scan key offset + * references into references to the scan's so->keyData[] output scan keys. + * + * Note: the reason we need to return a temp scan key array, rather than just + * scribbling on scan->keyData, is that callers are permitted to call btrescan + * without supplying a new set of scankey data. */ -void +static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; int numberOfKeys = scan->numberOfKeys; - int16 *indoption = scan->indexRelation->rd_indoption; + int16 *indoption = rel->rd_indoption; int numArrayKeys; + int origarrayatt = InvalidAttrNumber, + origarraykey = -1; + Oid origelemtype = InvalidOid; ScanKey cur; - int i; MemoryContext oldContext; + ScanKey arrayKeyData; /* modified copy of scan->keyData */ + + Assert(numberOfKeys); /* Quick check to see if there are any array keys */ numArrayKeys = 0; - for (i = 0; i < numberOfKeys; i++) + for (int i = 0; i < numberOfKeys; i++) { cur = &scan->keyData[i]; if (cur->sk_flags & SK_SEARCHARRAY) @@ -220,20 +294,15 @@ _bt_preprocess_array_keys(IndexScanDesc scan) /* If any arrays are null as a whole, we can quit right now. */ if (cur->sk_flags & SK_ISNULL) { - so->numArrayKeys = -1; - so->arrayKeyData = NULL; - return; + so->qual_ok = false; + return NULL; } } } /* Quit if nothing to do. 
*/ if (numArrayKeys == 0) - { - so->numArrayKeys = 0; - so->arrayKeyData = NULL; - return; - } + return NULL; /* * Make a scan-lifespan context to hold array-associated data, or reset it @@ -249,18 +318,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan) oldContext = MemoryContextSwitchTo(so->arrayContext); /* Create modifiable copy of scan->keyData in the workspace context */ - so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); - memcpy(so->arrayKeyData, - scan->keyData, - scan->numberOfKeys * sizeof(ScanKeyData)); + arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData)); + memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ - so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo)); + so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); + + /* Allocate space for ORDER procs used to help _bt_checkkeys */ + so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo)); /* Now process each array key */ numArrayKeys = 0; - for (i = 0; i < numberOfKeys; i++) + for (int i = 0; i < numberOfKeys; i++) { + FmgrInfo sortproc; + FmgrInfo *sortprocp = &sortproc; + Oid elemtype; + bool reverse; ArrayType *arrayval; int16 elmlen; bool elmbyval; @@ -271,7 +345,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) int num_nonnulls; int j; - cur = &so->arrayKeyData[i]; + cur = &arrayKeyData[i]; if (!(cur->sk_flags & SK_SEARCHARRAY)) continue; @@ -305,10 +379,21 @@ _bt_preprocess_array_keys(IndexScanDesc scan) /* If there's no non-nulls, the scan qual is unsatisfiable */ if (num_nonnulls == 0) { - numArrayKeys = -1; + so->qual_ok = false; break; } + /* + * Determine the nominal datatype of the array elements. We have to + * support the convention that sk_subtype == InvalidOid means the + * opclass input type; this is a hack to simplify life for + * ScanKeyInit(). + */ + elemtype = cur->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[cur->sk_attno - 1]; + Assert(elemtype == ARR_ELEMTYPE(arrayval)); + /* * If the comparison operator is not equality, then the array qual * degenerates to a simple comparison against the smallest or largest @@ -319,7 +404,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) case BTLessStrategyNumber: case BTLessEqualStrategyNumber: cur->sk_argument = - _bt_find_extreme_element(scan, cur, + _bt_find_extreme_element(scan, cur, elemtype, BTGreaterStrategyNumber, elem_values, num_nonnulls); continue; @@ -329,7 +414,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: cur->sk_argument = - _bt_find_extreme_element(scan, cur, + _bt_find_extreme_element(scan, cur, elemtype, BTLessStrategyNumber, elem_values, num_nonnulls); continue; @@ -339,17 +424,93 @@ _bt_preprocess_array_keys(IndexScanDesc scan) break; } + /* + * We'll need a 3-way ORDER proc to perform binary searches for the + * next matching array element. Set that up now. + * + * Array scan keys with cross-type equality operators will require a + * separate same-type ORDER proc for sorting their array. Otherwise, + * sortproc just points to the same proc used during binary searches. + */ + _bt_setup_array_cmp(scan, cur, elemtype, + &so->orderProcs[i], &sortprocp); + /* * Sort the non-null elements and eliminate any duplicates. 
We must * sort in the same ordering used by the index column, so that the - * successive primitive indexscans produce data in index order. + * arrays can be advanced in lockstep with the scan's progress through + * the index's key space. */ - num_elems = _bt_sort_array_elements(scan, cur, - (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, + reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0; + num_elems = _bt_sort_array_elements(cur, sortprocp, reverse, elem_values, num_nonnulls); + if (origarrayatt == cur->sk_attno) + { + BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey]; + + /* + * This array scan key is redundant with a previous equality + * operator array scan key. Merge the two arrays together to + * eliminate contradictory non-intersecting elements (or try to). + * + * We merge this next array back into attribute's original array. + */ + Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno); + Assert(arrayKeyData[orig->scan_key].sk_collation == + cur->sk_collation); + if (_bt_merge_arrays(scan, cur, sortprocp, reverse, + origelemtype, elemtype, + orig->elem_values, &orig->num_elems, + elem_values, num_elems)) + { + /* Successfully eliminated this array */ + pfree(elem_values); + + /* + * If no intersecting elements remain in the original array, + * the scan qual is unsatisfiable + */ + if (orig->num_elems == 0) + { + so->qual_ok = false; + break; + } + + /* + * Indicate to _bt_preprocess_keys caller that it must ignore + * this scan key + */ + cur->sk_strategy = InvalidStrategy; + continue; + } + + /* + * Unable to merge this array with previous array due to a lack of + * suitable cross-type opfamily support. Will need to keep both + * scan keys/arrays. + */ + } + else + { + /* + * This array is the first for current index attribute. + * + * If it turns out to not be the last array (that is, if the next + * array is redundantly applied to this same index attribute), + * we'll then treat this array as the attribute's "original" array + * when merging. + */ + origarrayatt = cur->sk_attno; + origarraykey = numArrayKeys; + origelemtype = elemtype; + } + /* * And set up the BTArrayKeyInfo data. + * + * Note: _bt_preprocess_array_keys_final will fix-up each array's + * scan_key field later on, after so->keyData[] has been finalized. */ so->arrayKeys[numArrayKeys].scan_key = i; so->arrayKeys[numArrayKeys].num_elems = num_elems; @@ -360,6 +521,256 @@ _bt_preprocess_array_keys(IndexScanDesc scan) so->numArrayKeys = numArrayKeys; MemoryContextSwitchTo(oldContext); + + return arrayKeyData; +} + +/* + * _bt_preprocess_array_keys_final() -- fix up array scan key references + * + * When _bt_preprocess_array_keys performed initial array preprocessing, it + * set each array's array->scan_key to the array's arrayKeys[] entry offset + * (that also work as references into the original scan->keyData[] array). + * This function handles translation of the scan key references from the + * BTArrayKeyInfo info array, from input scan key references (to the keys in + * scan->keyData[]), into output references (to the keys in so->keyData[]). + * Caller's keyDataMap[] array tells us how to perform this remapping. + * + * Also finalizes so->orderProcs[] for the scan. Arrays already have an ORDER + * proc, which might need to be repositioned to its so->keyData[]-wise offset + * (very much like the remapping that we apply to array->scan_key references). 
+ * Non-array equality strategy scan keys (that survived preprocessing) don't + * yet have an so->orderProcs[] entry, so we set one for them here. + * + * Also converts single-element array scan keys into equivalent non-array + * equality scan keys, which decrements so->numArrayKeys. It's possible that + * this will leave this new btrescan without any arrays at all. This isn't + * necessary for correctness; it's just an optimization. Non-array equality + * scan keys are slightly faster than equivalent array scan keys at runtime. + */ +static void +_bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + int arrayidx = 0; + int last_equal_output_ikey PG_USED_FOR_ASSERTS_ONLY = -1; + + Assert(so->qual_ok); + Assert(so->numArrayKeys); + + for (int output_ikey = 0; output_ikey < so->numberOfKeys; output_ikey++) + { + ScanKey outkey = so->keyData + output_ikey; + int input_ikey; + bool found PG_USED_FOR_ASSERTS_ONLY = false; + + Assert(outkey->sk_strategy != InvalidStrategy); + + if (outkey->sk_strategy != BTEqualStrategyNumber) + continue; + + input_ikey = keyDataMap[output_ikey]; + + Assert(last_equal_output_ikey < output_ikey); + Assert(last_equal_output_ikey < input_ikey); + last_equal_output_ikey = output_ikey; + + /* + * We're lazy about looking up ORDER procs for non-array keys, since + * not all input keys become output keys. Take care of it now. + */ + if (!(outkey->sk_flags & SK_SEARCHARRAY)) + { + Oid elemtype; + + /* No need for an ORDER proc given an IS NULL scan key */ + if (outkey->sk_flags & SK_SEARCHNULL) + continue; + + /* + * A non-required scan key doesn't need an ORDER proc, either + * (unless it's associated with an array, which this one isn't) + */ + if (!(outkey->sk_flags & SK_BT_REQFWD)) + continue; + + elemtype = outkey->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[outkey->sk_attno - 1]; + + _bt_setup_array_cmp(scan, outkey, elemtype, + &so->orderProcs[output_ikey], NULL); + continue; + } + + /* + * Reorder existing array scan key so->orderProcs[] entries. + * + * Doing this in-place is safe because preprocessing is required to + * output all equality strategy scan keys in original input order + * (among each group of entries against the same index attribute). + * This is also the order that the arrays themselves appear in. + */ + so->orderProcs[output_ikey] = so->orderProcs[input_ikey]; + + /* Fix-up array->scan_key references for arrays */ + for (; arrayidx < so->numArrayKeys; arrayidx++) + { + BTArrayKeyInfo *array = &so->arrayKeys[arrayidx]; + + Assert(array->num_elems > 0); + + if (array->scan_key == input_ikey) + { + /* found it */ + array->scan_key = output_ikey; + found = true; + + /* + * Transform array scan keys that have exactly 1 element + * remaining (following all prior preprocessing) into + * equivalent non-array scan keys. 
+ */ + if (array->num_elems == 1) + { + outkey->sk_flags &= ~SK_SEARCHARRAY; + outkey->sk_argument = array->elem_values[0]; + so->numArrayKeys--; + + /* If we're out of array keys, we can quit right away */ + if (so->numArrayKeys == 0) + return; + + /* Shift other arrays forward */ + memmove(array, array + 1, + sizeof(BTArrayKeyInfo) * + (so->numArrayKeys - arrayidx)); + + /* + * Don't increment arrayidx (there was an entry that was + * just shifted forward to the offset at arrayidx, which + * will still need to be matched) + */ + } + else + { + /* Match found, so done with this array */ + arrayidx++; + } + + break; + } + } + + Assert(found); + } + + /* + * Parallel index scans require space in shared memory to store the + * current array elements (for arrays kept by preprocessing) to schedule + * the next primitive index scan. The underlying structure is protected + * using a spinlock, so defensively limit its size. In practice this can + * only affect parallel scans that use an incomplete opfamily. + */ + if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("number of array scan keys left by preprocessing (%d) exceeds the maximum allowed by parallel btree index scans (%d)", + so->numArrayKeys, INDEX_MAX_KEYS))); +} + +/* + * _bt_setup_array_cmp() -- Set up array comparison functions + * + * Sets ORDER proc in caller's orderproc argument, which is used during binary + * searches of arrays during the index scan. Also sets a same-type ORDER proc + * in caller's *sortprocp argument, which is used when sorting the array. + * + * Preprocessing calls here with all equality strategy scan keys (when scan + * uses equality array keys), including those not associated with any array. + * See _bt_advance_array_keys for an explanation of why it'll need to treat + * simple scalar equality scan keys as degenerate single element arrays. + * + * Caller should pass an orderproc pointing to space that'll store the ORDER + * proc for the scan, and a *sortprocp pointing to its own separate space. + * When calling here for a non-array scan key, sortprocp arg should be NULL. + * + * In the common case where we don't need to deal with cross-type operators, + * only one ORDER proc is actually required by caller. We'll set *sortprocp + * to point to the same memory that caller's orderproc continues to point to. + * Otherwise, *sortprocp will continue to point to caller's own space. Either + * way, *sortprocp will point to a same-type ORDER proc (since that's the only + * safe way to sort/deduplicate the array associated with caller's scan key). + */ +static void +_bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype, + FmgrInfo *orderproc, FmgrInfo **sortprocp) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + RegProcedure cmp_proc; + Oid opcintype = rel->rd_opcintype[skey->sk_attno - 1]; + + Assert(skey->sk_strategy == BTEqualStrategyNumber); + Assert(OidIsValid(elemtype)); + + /* + * If scankey operator is not a cross-type comparison, we can use the + * cached comparison function; otherwise gotta look it up in the catalogs + */ + if (elemtype == opcintype) + { + /* Set same-type ORDER procs for caller */ + *orderproc = *index_getprocinfo(rel, skey->sk_attno, BTORDER_PROC); + if (sortprocp) + *sortprocp = orderproc; + + return; + } + + /* + * Look up the appropriate cross-type comparison function in the opfamily. 
+ * + * Use the opclass input type as the left hand arg type, and the array + * element type as the right hand arg type (since binary searches use an + * index tuple's attribute value to search for a matching array element). + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but only in cases where it's quite likely that _bt_first + * would fail in just the same way (had we not failed before it could). + */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + opcintype, elemtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, opcintype, elemtype, skey->sk_attno, + RelationGetRelationName(rel)); + + /* Set cross-type ORDER proc for caller */ + fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext); + + /* Done if caller doesn't actually have an array they'll need to sort */ + if (!sortprocp) + return; + + /* + * Look up the appropriate same-type comparison function in the opfamily. + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but it seems quite unlikely that an opfamily would omit + * non-cross-type comparison procs for any datatype that it supports at + * all. + */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + elemtype, elemtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, elemtype, elemtype, + skey->sk_attno, RelationGetRelationName(rel)); + + /* Set same-type ORDER proc for caller */ + fmgr_info_cxt(cmp_proc, *sortprocp, so->arrayContext); } /* @@ -370,27 +781,17 @@ _bt_preprocess_array_keys(IndexScanDesc scan) * least element, or BTGreaterStrategyNumber to get the greatest. */ static Datum -_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, +_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype, StrategyNumber strat, Datum *elems, int nelems) { Relation rel = scan->indexRelation; - Oid elemtype, - cmp_op; + Oid cmp_op; RegProcedure cmp_proc; FmgrInfo flinfo; Datum result; int i; - /* - * Determine the nominal datatype of the array elements. We have to - * support the convention that sk_subtype == InvalidOid means the opclass - * input type; this is a hack to simplify life for ScanKeyInit(). - */ - elemtype = skey->sk_subtype; - if (elemtype == InvalidOid) - elemtype = rel->rd_opcintype[skey->sk_attno - 1]; - /* * Look up the appropriate comparison operator in the opfamily. * @@ -399,6 +800,8 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, * non-cross-type comparison operators for any datatype that it supports * at all. */ + Assert(skey->sk_strategy != BTEqualStrategyNumber); + Assert(OidIsValid(elemtype)); cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1], elemtype, elemtype, @@ -433,50 +836,21 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, * The array elements are sorted in-place, and the new number of elements * after duplicate removal is returned. * - * scan and skey identify the index column, whose opfamily determines the - * comparison semantics. If reverse is true, we sort in descending order. + * skey identifies the index column whose opfamily determines the comparison + * semantics, and sortproc is a corresponding ORDER proc. If reverse is true, + * we sort in descending order. 
*/ static int -_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, - bool reverse, +_bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, bool reverse, Datum *elems, int nelems) { - Relation rel = scan->indexRelation; - Oid elemtype; - RegProcedure cmp_proc; BTSortArrayContext cxt; if (nelems <= 1) return nelems; /* no work to do */ - /* - * Determine the nominal datatype of the array elements. We have to - * support the convention that sk_subtype == InvalidOid means the opclass - * input type; this is a hack to simplify life for ScanKeyInit(). - */ - elemtype = skey->sk_subtype; - if (elemtype == InvalidOid) - elemtype = rel->rd_opcintype[skey->sk_attno - 1]; - - /* - * Look up the appropriate comparison function in the opfamily. - * - * Note: it's possible that this would fail, if the opfamily is - * incomplete, but it seems quite unlikely that an opfamily would omit - * non-cross-type support functions for any datatype that it supports at - * all. - */ - cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], - elemtype, - elemtype, - BTORDER_PROC); - if (!RegProcedureIsValid(cmp_proc)) - elog(ERROR, "missing support function %d(%u,%u) in opfamily %u", - BTORDER_PROC, elemtype, elemtype, - rel->rd_opfamily[skey->sk_attno - 1]); - /* Sort the array elements */ - fmgr_info(cmp_proc, &cxt.flinfo); + cxt.sortproc = sortproc; cxt.collation = skey->sk_collation; cxt.reverse = reverse; qsort_arg(elems, nelems, sizeof(Datum), @@ -487,6 +861,232 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, _bt_compare_array_elements, &cxt); } +/* + * _bt_merge_arrays() -- merge next array's elements into an original array + * + * Called when preprocessing encounters a pair of array equality scan keys, + * both against the same index attribute (during initial array preprocessing). + * Merging reorganizes caller's original array (the left hand arg) in-place, + * without ever copying elements from one array into the other. (Mixing the + * elements together like this would be wrong, since they don't necessarily + * use the same underlying element type, despite all the other similarities.) + * + * Both arrays must have already been sorted and deduplicated by calling + * _bt_sort_array_elements. sortproc is the same-type ORDER proc that was + * just used to sort and deduplicate caller's "next" array. We'll usually be + * able to reuse that order PROC to merge the arrays together now. If not, + * then we'll perform a separate ORDER proc lookup. + * + * If the opfamily doesn't supply a complete set of cross-type ORDER procs we + * may not be able to determine which elements are contradictory. If we have + * the required ORDER proc then we return true (and validly set *nelems_orig), + * guaranteeing that at least the next array can be considered redundant. We + * return false if the required comparisons cannot be made (caller must keep + * both arrays when this happens). 
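The sort-and-deduplicate step that _bt_sort_array_elements performs can likewise be modeled in miniature. A hedged standalone sketch with plain integers; the real code sorts Datums through the supplied ORDER proc and squeezes out duplicates the same way:

    #include <stdio.h>
    #include <stdlib.h>

    static int
    cmp_int(const void *a, const void *b)
    {
        int     lhs = *(const int *) a;
        int     rhs = *(const int *) b;

        return (lhs > rhs) - (lhs < rhs);
    }

    /* sort elems ascending, squeeze out duplicates in place, return length */
    static int
    sort_dedup(int *elems, int nelems)
    {
        int     nunique = 0;

        if (nelems <= 1)
            return nelems;      /* no work to do */
        qsort(elems, nelems, sizeof(int), cmp_int);
        for (int i = 1; i < nelems; i++)
        {
            if (elems[i] != elems[nunique])
                elems[++nunique] = elems[i];
        }
        return nunique + 1;
    }

    int
    main(void)
    {
        int     elems[] = {5, 1, 5, 3, 1};
        int     n = sort_dedup(elems, 5);

        for (int i = 0; i < n; i++)
            printf("%d ", elems[i]);    /* prints "1 3 5" */
        printf("\n");
        return 0;
    }
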
+ */ +static bool +_bt_merge_arrays(IndexScanDesc scan, ScanKey skey, FmgrInfo *sortproc, + bool reverse, Oid origelemtype, Oid nextelemtype, + Datum *elems_orig, int *nelems_orig, + Datum *elems_next, int nelems_next) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSortArrayContext cxt; + int nelems_orig_start = *nelems_orig, + nelems_orig_merged = 0; + FmgrInfo *mergeproc = sortproc; + FmgrInfo crosstypeproc; + + Assert(skey->sk_strategy == BTEqualStrategyNumber); + Assert(OidIsValid(origelemtype) && OidIsValid(nextelemtype)); + + if (origelemtype != nextelemtype) + { + RegProcedure cmp_proc; + + /* + * Cross-array-element-type merging is required, so can't just reuse + * sortproc when merging + */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + origelemtype, nextelemtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + { + /* Can't make the required comparisons */ + return false; + } + + /* We have all we need to determine redundancy/contradictoriness */ + mergeproc = &crosstypeproc; + fmgr_info_cxt(cmp_proc, mergeproc, so->arrayContext); + } + + cxt.sortproc = mergeproc; + cxt.collation = skey->sk_collation; + cxt.reverse = reverse; + + for (int i = 0, j = 0; i < nelems_orig_start && j < nelems_next;) + { + Datum *oelem = elems_orig + i, + *nelem = elems_next + j; + int res = _bt_compare_array_elements(oelem, nelem, &cxt); + + if (res == 0) + { + elems_orig[nelems_orig_merged++] = *oelem; + i++; + j++; + } + else if (res < 0) + i++; + else /* res > 0 */ + j++; + } + + *nelems_orig = nelems_orig_merged; + + return true; +} + +/* + * Compare an array scan key to a scalar scan key, eliminating contradictory + * array elements such that the scalar scan key becomes redundant. + * + * Array elements can be eliminated as contradictory when excluded by some + * other operator on the same attribute. For example, with an index scan qual + * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1" + * are eliminated, and the < scan key is eliminated as redundant. Cases where + * every array element is eliminated by a redundant scalar scan key have an + * unsatisfiable qual, which we handle by setting *qual_ok=false for caller. + * + * If the opfamily doesn't supply a complete set of cross-type ORDER procs we + * may not be able to determine which elements are contradictory. If we have + * the required ORDER proc then we return true (and validly set *qual_ok), + * guaranteeing that at least the scalar scan key can be considered redundant. + * We return false if the comparison could not be made (caller must keep both + * scan keys when this happens). 
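+ *
+ * For example (another hypothetical qual), "WHERE a IN (1, 2, 3) AND
+ * a > 5" eliminates every array element, so *qual_ok is set to false
+ * and the scan can return no tuples.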
+ */ +static bool +_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, + bool *qual_ok) +{ + Relation rel = scan->indexRelation; + Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1]; + int cmpresult = 0, + cmpexact = 0, + matchelem, + new_nelems = 0; + FmgrInfo crosstypeproc; + FmgrInfo *orderprocp = orderproc; + + Assert(arraysk->sk_attno == skey->sk_attno); + Assert(array->num_elems > 0); + Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert((arraysk->sk_flags & SK_SEARCHARRAY) && + arraysk->sk_strategy == BTEqualStrategyNumber); + Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert(!(skey->sk_flags & SK_SEARCHARRAY) || + skey->sk_strategy != BTEqualStrategyNumber); + + /* + * _bt_binsrch_array_skey searches an array for the entry best matching a + * datum of opclass input type for the index's attribute (on-disk type). + * We can reuse the array's ORDER proc whenever the non-array scan key's + * type is a match for the corresponding attribute's input opclass type. + * Otherwise, we have to do another ORDER proc lookup so that our call to + * _bt_binsrch_array_skey applies the correct comparator. + * + * Note: we have to support the convention that sk_subtype == InvalidOid + * means the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). + */ + if (skey->sk_subtype != opcintype && skey->sk_subtype != InvalidOid) + { + RegProcedure cmp_proc; + Oid arraysk_elemtype; + + /* + * Need an ORDER proc lookup to detect redundancy/contradictoriness + * with this pair of scankeys. + * + * Scalar scan key's argument will be passed to _bt_compare_array_skey + * as its tupdatum/lefthand argument (rhs arg is for array elements). 
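+ *
+ * For example (hypothetically), an int4 index column with the quals
+ * "a = ANY('{1, 2, 3}'::int4[]) AND a < 42::int8" requires the
+ * opfamily's (int8, int4) ORDER proc here: the scalar key's int8
+ * argument is what gets passed as the lefthand/tupdatum argument during
+ * the binary search of the int4 array.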
+ */ + arraysk_elemtype = arraysk->sk_subtype; + if (arraysk_elemtype == InvalidOid) + arraysk_elemtype = rel->rd_opcintype[arraysk->sk_attno - 1]; + cmp_proc = get_opfamily_proc(rel->rd_opfamily[arraysk->sk_attno - 1], + skey->sk_subtype, arraysk_elemtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + { + /* Can't make the comparison */ + *qual_ok = false; /* suppress compiler warnings */ + return false; + } + + /* We have all we need to determine redundancy/contradictoriness */ + orderprocp = &crosstypeproc; + fmgr_info(cmp_proc, orderprocp); + } + + matchelem = _bt_binsrch_array_skey(orderprocp, false, + NoMovementScanDirection, + skey->sk_argument, false, array, + arraysk, &cmpresult); + + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + cmpexact = 1; /* exclude exact match, if any */ + /* FALL THRU */ + case BTLessEqualStrategyNumber: + if (cmpresult >= cmpexact) + matchelem++; + /* Resize, keeping elements from the start of the array */ + new_nelems = matchelem; + break; + case BTEqualStrategyNumber: + if (cmpresult != 0) + { + /* qual is unsatisfiable */ + new_nelems = 0; + } + else + { + /* Shift matching element to the start of the array, resize */ + array->elem_values[0] = array->elem_values[matchelem]; + new_nelems = 1; + } + break; + case BTGreaterEqualStrategyNumber: + cmpexact = 1; /* include exact match, if any */ + /* FALL THRU */ + case BTGreaterStrategyNumber: + if (cmpresult >= cmpexact) + matchelem++; + /* Shift matching elements to the start of the array, resize */ + new_nelems = array->num_elems - matchelem; + memmove(array->elem_values, array->elem_values + matchelem, + sizeof(Datum) * new_nelems); + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } + + Assert(new_nelems >= 0); + Assert(new_nelems <= array->num_elems); + + array->num_elems = new_nelems; + *qual_ok = new_nelems > 0; + + return true; +} + /* * qsort_arg comparator for sorting array elements */ @@ -498,7 +1098,7 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) BTSortArrayContext *cxt = (BTSortArrayContext *) arg; int32 compare; - compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo, + compare = DatumGetInt32(FunctionCall2Coll(cxt->sortproc, cxt->collation, da, db)); if (cxt->reverse) @@ -506,11 +1106,233 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) return compare; } +/* + * _bt_compare_array_skey() -- apply array comparison function + * + * Compares caller's tuple attribute value to a scan key/array element. + * Helper function used during binary searches of SK_SEARCHARRAY arrays. + * + * This routine returns: + * <0 if tupdatum < arrdatum; + * 0 if tupdatum == arrdatum; + * >0 if tupdatum > arrdatum. + * + * This is essentially the same interface as _bt_compare: both functions + * compare the value that they're searching for to a binary search pivot. + * However, unlike _bt_compare, this function's "tuple argument" comes first, + * while its "array/scankey argument" comes second. 
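+ *
+ * For example, comparing tupdatum=5 against arrdatum=7 for an ASC int4
+ * column yields a negative result here (5 sorts before 7); were the
+ * column DESC instead, the sign of the underlying ORDER proc comparison
+ * would be inverted.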
+*/ +static inline int32 +_bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur) +{ + int32 result = 0; + + Assert(cur->sk_strategy == BTEqualStrategyNumber); + + if (tupnull) /* NULL tupdatum */ + { + if (cur->sk_flags & SK_ISNULL) + result = 0; /* NULL "=" NULL */ + else if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * Like _bt_compare, we need to be careful of cross-type comparisons, + * so the left value has to be the value that came from an index tuple + */ + result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation, + tupdatum, arrdatum)); + + /* + * We flip the sign by following the obvious rule: flip whenever the + * column is a DESC column. + * + * _bt_compare does it the wrong way around (flip when *ASC*) in order + * to compensate for passing its orderproc arguments backwards. We + * don't need to play these games because we find it natural to pass + * tupdatum as the left value (and arrdatum as the right value). + */ + if (cur->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(result); + } + + return result; +} + +/* + * _bt_binsrch_array_skey() -- Binary search for next matching array key + * + * Returns an index to the first array element >= caller's tupdatum argument. + * This convention is more natural for forwards scan callers, but that can't + * really matter to backwards scan callers. Both callers require handling for + * the case where the match we return is < tupdatum, and symmetric handling + * for the case where our best match is > tupdatum. + * + * Also sets *set_elem_result to the result _bt_compare_array_skey returned + * when we used it to compare the matching array element to tupdatum/tupnull. + * + * cur_elem_trig indicates if array advancement was triggered by this array's + * scan key, and that the array is for a required scan key. We can apply this + * information to find the next matching array element in the current scan + * direction using far fewer comparisons (fewer on average, compared to naive + * binary search). This scheme takes advantage of an important property of + * required arrays: required arrays always advance in lockstep with the index + * scan's progress through the index's key space. + */ +static int +_bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + int low_elem = 0, + mid_elem = -1, + high_elem = array->num_elems - 1, + result = 0; + Datum arrdatum; + + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_strategy == BTEqualStrategyNumber); + + if (cur_elem_trig) + { + Assert(!ScanDirectionIsNoMovement(dir)); + Assert(cur->sk_flags & SK_BT_REQFWD); + + /* + * When the scan key that triggered array advancement is a required + * array scan key, it is now certain that the current array element + * (plus all prior elements relative to the current scan direction) + * cannot possibly be at or ahead of the corresponding tuple value. + * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which + * makes sure this is true as a condition of advancing the arrays.) 
+ * + * This makes it safe to exclude array elements up to and including + * the former-current array element from our search. + * + * Separately, when array advancement was triggered by a required scan + * key, the array element immediately after the former-current element + * is often either an exact tupdatum match, or a "close by" near-match + * (a near-match tupdatum is one whose key space falls _between_ the + * former-current and new-current array elements). We'll detect both + * cases via an optimistic comparison of the new search lower bound + * (or new search upper bound in the case of backwards scans). + */ + if (ScanDirectionIsForward(dir)) + { + low_elem = array->cur_elem + 1; /* old cur_elem exhausted */ + + /* Compare prospective new cur_elem (also the new lower bound) */ + if (high_elem >= low_elem) + { + arrdatum = array->elem_values[low_elem]; + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result <= 0) + { + /* Optimistic comparison optimization worked out */ + *set_elem_result = result; + return low_elem; + } + mid_elem = low_elem; + low_elem++; /* this cur_elem exhausted, too */ + } + + if (high_elem < low_elem) + { + /* Caller needs to perform "beyond end" array advancement */ + *set_elem_result = 1; + return high_elem; + } + } + else + { + high_elem = array->cur_elem - 1; /* old cur_elem exhausted */ + + /* Compare prospective new cur_elem (also the new upper bound) */ + if (high_elem >= low_elem) + { + arrdatum = array->elem_values[high_elem]; + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result >= 0) + { + /* Optimistic comparison optimization worked out */ + *set_elem_result = result; + return high_elem; + } + mid_elem = high_elem; + high_elem--; /* this cur_elem exhausted, too */ + } + + if (high_elem < low_elem) + { + /* Caller needs to perform "beyond end" array advancement */ + *set_elem_result = -1; + return low_elem; + } + } + } + + while (high_elem > low_elem) + { + mid_elem = low_elem + ((high_elem - low_elem) / 2); + arrdatum = array->elem_values[mid_elem]; + + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result == 0) + { + /* + * It's safe to quit as soon as we see an equal array element. + * This often saves an extra comparison or two... + */ + low_elem = mid_elem; + break; + } + + if (result > 0) + low_elem = mid_elem + 1; + else + high_elem = mid_elem; + } + + /* + * ...but our caller also cares about how its searched-for tuple datum + * compares to the low_elem datum. Must always set *set_elem_result with + * the result of that comparison specifically. + */ + if (low_elem != mid_elem) + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + array->elem_values[low_elem], cur); + + *set_elem_result = result; + + return low_elem; +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * * Set up the cur_elem counters and fill in the first sk_argument value for - * each array scankey. We can't do this until we know the scan direction. + * each array scankey. 
*/ void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) @@ -518,159 +1340,1132 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; int i; + Assert(so->numArrayKeys); + Assert(so->qual_ok); + for (i = 0; i < so->numArrayKeys; i++) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + ScanKey skey = &so->keyData[curArrayKey->scan_key]; Assert(curArrayKey->num_elems > 0); + Assert(skey->sk_flags & SK_SEARCHARRAY); + if (ScanDirectionIsBackward(dir)) curArrayKey->cur_elem = curArrayKey->num_elems - 1; else curArrayKey->cur_elem = 0; skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; } - - so->arraysStarted = true; + so->scanBehind = false; } /* - * _bt_advance_array_keys() -- Advance to next set of array elements + * _bt_advance_array_keys_increment() -- Advance to next set of array elements + * + * Advances the array keys by a single increment in the current scan + * direction. When there are multiple array keys this can roll over from the + * lowest order array to higher order arrays. * * Returns true if there is another set of values to consider, false if not. * On true result, the scankeys are initialized with the next set of values. + * On false result, the scankeys stay the same, and the array keys are not + * advanced (every array remains at its final element for scan direction). */ -bool -_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) +static bool +_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - bool found = false; - int i; /* * We must advance the last array key most quickly, since it will * correspond to the lowest-order index column among the available - * qualifications. This is necessary to ensure correct ordering of output - * when there are multiple array keys. + * qualifications */ - for (i = so->numArrayKeys - 1; i >= 0; i--) + for (int i = so->numArrayKeys - 1; i >= 0; i--) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + ScanKey skey = &so->keyData[curArrayKey->scan_key]; int cur_elem = curArrayKey->cur_elem; int num_elems = curArrayKey->num_elems; + bool rolled = false; - if (ScanDirectionIsBackward(dir)) + if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems) { - if (--cur_elem < 0) - { - cur_elem = num_elems - 1; - found = false; /* need to advance next array key */ - } - else - found = true; + cur_elem = 0; + rolled = true; } - else + else if (ScanDirectionIsBackward(dir) && --cur_elem < 0) { - if (++cur_elem >= num_elems) - { - cur_elem = 0; - found = false; /* need to advance next array key */ - } - else - found = true; + cur_elem = num_elems - 1; + rolled = true; } curArrayKey->cur_elem = cur_elem; skey->sk_argument = curArrayKey->elem_values[cur_elem]; - if (found) - break; - } + if (!rolled) + return true; - /* advance parallel scan */ - if (scan->parallel_scan != NULL) - _bt_parallel_advance_array_keys(scan); + /* Need to advance next array key, if any */ + } /* - * When no new array keys were found, the scan is "past the end" of the - * array keys. _bt_start_array_keys can still "restart" the array keys if - * a rescan is required. + * The array keys are now exhausted. (There isn't actually a distinct + * state that represents array exhaustion, since index scans don't always + * end after btgettuple returns "false".) 
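+ *
+ * For example, with hypothetical arrays a in {1, 2} and b in {10, 20},
+ * a forward scan increments through (1, 10), (1, 20), (2, 10), and
+ * (2, 20); the increment that would pass (2, 20) rolls over at every
+ * array, which is how we end up here.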
+ * + * Restore the array keys to the state they were in immediately before we + * were called. This ensures that the arrays only ever ratchet in the + * current scan direction. Without this, scans would overlook matching + * tuples if and when the scan's direction was subsequently reversed. */ - if (!found) - so->arraysStarted = false; + _bt_start_array_keys(scan, -dir); - return found; + return false; } /* - * _bt_mark_array_keys() -- Handle array keys during btmarkpos + * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays * - * Save the current state of the array keys as the "mark" position. + * Called when _bt_advance_array_keys decides to start a new primitive index + * scan on the basis of the current scan position being before the position + * that _bt_first is capable of repositioning the scan to by applying an + * inequality operator required in the opposite-to-scan direction only. + * + * Although equality strategy scan keys (for both arrays and non-arrays alike) + * are either marked required in both directions or in neither direction, + * there is a sense in which non-required arrays behave like required arrays. + * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)", + * the scan key on "c" is non-required, but nevertheless enables positioning + * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the + * first descent of the tree by _bt_first. Later on, there could also be a + * second descent, that places the scan right before tuples >= "(200, 3, 5)". + * _bt_first must never be allowed to build an insertion scan key whose "c" + * entry is set to a value other than 5, the "c" array's first element/value. + * (Actually, it's the first in the current scan direction. This example uses + * a forward scan.) + * + * Calling here resets the array scan key elements for the scan's non-required + * arrays. This is strictly necessary for correctness in a subset of cases + * involving "required in opposite direction"-triggered primitive index scans. + * Not all callers are at risk of _bt_first using a non-required array like + * this, but advancement always resets the arrays when another primitive scan + * is scheduled, just to keep things simple. Array advancement even makes + * sure to reset non-required arrays during scans that have no inequalities. + * (Advancement still won't call here when there are no inequalities, though + * that's just because it's all handled indirectly instead.) + * + * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that + * everybody got this right. 
 */
-void
-_bt_mark_array_keys(IndexScanDesc scan)
+static void
+_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
{
 BTScanOpaque so = (BTScanOpaque) scan->opaque;
- int i;
+ int arrayidx = 0;

- for (i = 0; i < so->numArrayKeys; i++)
+ for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
 {
- BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+ ScanKey cur = so->keyData + ikey;
+ BTArrayKeyInfo *array = NULL;
+ int first_elem_dir;

- curArrayKey->mark_elem = curArrayKey->cur_elem;
+ if (!(cur->sk_flags & SK_SEARCHARRAY) ||
+ cur->sk_strategy != BTEqualStrategyNumber)
+ continue;
+
+ array = &so->arrayKeys[arrayidx++];
+ Assert(array->scan_key == ikey);
+
+ if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
+ continue;
+
+ if (ScanDirectionIsForward(dir))
+ first_elem_dir = 0;
+ else
+ first_elem_dir = array->num_elems - 1;
+
+ if (array->cur_elem != first_elem_dir)
+ {
+ array->cur_elem = first_elem_dir;
+ cur->sk_argument = array->elem_values[first_elem_dir];
+ }
 }
}

/*
- * _bt_restore_array_keys() -- Handle array keys during btrestrpos
+ * _bt_tuple_before_array_skeys() -- too early to advance required arrays?
 *
- * Restore the array keys to where they were when the mark was set.
+ * We always compare the tuple using the current array keys (which we assume
+ * are already set in so->keyData[]). readpagetup indicates if tuple is the
+ * scan's current _bt_readpage-wise tuple.
+ *
+ * readpagetup callers must only call here when _bt_check_compare already set
+ * continuescan=false. We help these callers deal with _bt_check_compare's
+ * inability to distinguish between the < and > cases (it uses equality
+ * operator scan keys, whereas we use 3-way ORDER procs). These callers pass
+ * a _bt_check_compare-set sktrig value that indicates which scan key
+ * triggered the call (!readpagetup callers just pass us sktrig=0 instead).
+ * This information allows us to avoid wastefully checking earlier scan keys
+ * that were already deemed to have been satisfied inside _bt_check_compare.
+ *
+ * Returns false when caller's tuple is >= the current required equality scan
+ * keys (or <=, in the case of backwards scans). This happens to readpagetup
+ * callers when the scan has reached the point of needing its array keys
+ * advanced; caller will need to advance required and non-required arrays at
+ * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over.
+ * (When we return false to readpagetup callers, tuple can only be == current
+ * required equality scan keys when caller's sktrig indicates that the arrays
+ * need to be advanced due to an unsatisfied required inequality key trigger.)
+ *
+ * Returns true when caller passes a tuple that is < the current set of
+ * equality keys for the most significant non-equal required scan key/column
+ * (or > the keys, during backwards scans). This happens to readpagetup
+ * callers when tuple is still before the start of matches for the scan's
+ * required equality strategy scan keys. (sktrig can't have indicated that an
+ * inequality strategy scan key wasn't satisfied in _bt_check_compare when we
+ * return true. In fact, we automatically return false when passed such an
+ * inequality sktrig by readpagetup callers -- _bt_check_compare's initial
+ * continuescan=false doesn't really need to be confirmed here by us.)
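+ *
+ * As a concrete (hypothetical) illustration: with the scan's current
+ * array keys at a = 100 during a forward scan, a tuple with a = 50 is
+ * still before the keys, so we return true; a tuple with a = 150 is at
+ * or ahead of them, so we return false, telling caller that the arrays
+ * are due to be advanced.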
+ * + * !readpagetup callers optionally pass us *scanBehind, which tracks whether + * any missing truncated attributes might have affected array advancement + * (compared to what would happen if it was shown the first non-pivot tuple on + * the page to the right of caller's finaltup/high key tuple instead). It's + * only possible that we'll set *scanBehind to true when caller passes us a + * pivot tuple (with truncated -inf attributes) that we return false for. */ -void -_bt_restore_array_keys(IndexScanDesc scan) +static bool +_bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, TupleDesc tupdesc, int tupnatts, + bool readpagetup, int sktrig, bool *scanBehind) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - bool changed = false; - int i; - /* Restore each array key to its position when the mark was set */ - for (i = 0; i < so->numArrayKeys; i++) + Assert(so->numArrayKeys); + Assert(so->numberOfKeys); + Assert(sktrig == 0 || readpagetup); + Assert(!readpagetup || scanBehind == NULL); + + if (scanBehind) + *scanBehind = false; + + for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) { - BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; - int mark_elem = curArrayKey->mark_elem; + ScanKey cur = so->keyData + ikey; + Datum tupdatum; + bool tupnull; + int32 result; - if (curArrayKey->cur_elem != mark_elem) + /* readpagetup calls require one ORDER proc comparison (at most) */ + Assert(!readpagetup || ikey == sktrig); + + /* + * Once we reach a non-required scan key, we're completely done. + * + * Note: we deliberately don't consider the scan direction here. + * _bt_advance_array_keys caller requires that we track *scanBehind + * without concern for scan direction. + */ + if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0) { - curArrayKey->cur_elem = mark_elem; - skey->sk_argument = curArrayKey->elem_values[mark_elem]; - changed = true; + Assert(!readpagetup); + Assert(ikey > sktrig || ikey == 0); + return false; + } + + if (cur->sk_attno > tupnatts) + { + Assert(!readpagetup); + + /* + * When we reach a high key's truncated attribute, assume that the + * tuple attribute's value is >= the scan's equality constraint + * scan keys (but set *scanBehind to let interested callers know + * that a truncated attribute might have affected our answer). + */ + if (scanBehind) + *scanBehind = true; + + return false; + } + + /* + * Deal with inequality strategy scan keys that _bt_check_compare set + * continuescan=false for + */ + if (cur->sk_strategy != BTEqualStrategyNumber) + { + /* + * When _bt_check_compare indicated that a required inequality + * scan key wasn't satisfied, there's no need to verify anything; + * caller always calls _bt_advance_array_keys with this sktrig. + */ + if (readpagetup) + return false; + + /* + * Otherwise we can't give up, since we must check all required + * scan keys (required in either direction) in order to correctly + * track *scanBehind for caller + */ + continue; + } + + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + + /* + * Does this comparison indicate that caller must _not_ advance the + * scan's arrays just yet? + */ + if ((ScanDirectionIsForward(dir) && result < 0) || + (ScanDirectionIsBackward(dir) && result > 0)) + return true; + + /* + * Does this comparison indicate that caller should now advance the + * scan's arrays? 
(Must be if we get here during a readpagetup call.) + */ + if (readpagetup || result != 0) + { + Assert(result != 0); + return false; + } + + /* + * Inconclusive -- need to check later scan keys, too. + * + * This must be a finaltup precheck, or a call made from an assertion. + */ + Assert(result == 0); + } + + Assert(!readpagetup); + + return false; +} + +/* + * _bt_start_prim_scan() -- start scheduled primitive index scan? + * + * Returns true if _bt_checkkeys scheduled another primitive index scan, just + * as the last one ended. Otherwise returns false, indicating that the array + * keys are now fully exhausted. + * + * Only call here during scans with one or more equality type array scan keys, + * after _bt_first or _bt_next return false. + */ +bool +_bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(so->numArrayKeys); + + /* scanBehind flag doesn't persist across primitive index scans - reset */ + so->scanBehind = false; + + /* + * Array keys are advanced within _bt_checkkeys when the scan reaches the + * leaf level (more precisely, they're advanced when the scan reaches the + * end of each distinct set of array elements). This process avoids + * repeat access to leaf pages (across multiple primitive index scans) by + * advancing the scan's array keys when it allows the primitive index scan + * to find nearby matching tuples (or when it eliminates ranges of array + * key space that can't possibly be satisfied by any index tuple). + * + * _bt_checkkeys sets a simple flag variable to schedule another primitive + * index scan. The flag tells us what to do. + * + * We cannot rely on _bt_first always reaching _bt_checkkeys. There are + * various cases where that won't happen. For example, if the index is + * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys. + * We also don't expect a call to _bt_checkkeys during searches for a + * non-existent value that happens to be lower/higher than any existing + * value in the index. + * + * We don't require special handling for these cases -- we don't need to + * be explicitly instructed to _not_ perform another primitive index scan. + * It's up to code under the control of _bt_first to always set the flag + * when another primitive index scan will be required. + * + * This works correctly, even with the tricky cases listed above, which + * all involve access to leaf pages "near the boundaries of the key space" + * (whether it's from a leftmost/rightmost page, or an imaginary empty + * leaf root page). If _bt_checkkeys cannot be reached by a primitive + * index scan for one set of array keys, then it also won't be reached for + * any later set ("later" in terms of the direction that we scan the index + * and advance the arrays). The array keys won't have advanced in these + * cases, but that's the correct behavior (even _bt_advance_array_keys + * won't always advance the arrays at the point they become "exhausted"). 
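+ *
+ * For example (hypothetically), given "WHERE a = ANY('{1, 1000000}')"
+ * and an index whose largest "a" value is far below 1000000, the
+ * primitive scan for the final array element finds no tuples at all, so
+ * _bt_checkkeys never runs, the flag stays unset, and we correctly end
+ * the top-level scan here.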
+ */
+ if (so->needPrimScan)
+ {
+ Assert(_bt_verify_arrays_bt_first(scan, dir));
+
+ /*
+ * Flag was set -- must call _bt_first again, which will reset the
+ * scan's needPrimScan flag
+ */
+ return true;
+ }
+
+ /* The top-level index scan ran out of tuples in this scan direction */
+ if (scan->parallel_scan != NULL)
+ _bt_parallel_done(scan);
+
+ return false;
+}
+
+/*
+ * _bt_advance_array_keys() -- Advance array elements using a tuple
+ *
+ * The scan always gets a new qual as a consequence of calling here (except
+ * when we determine that the top-level scan has run out of matching tuples).
+ * All later _bt_check_compare calls also use the same new qual that was first
+ * used here (at least until the next call here advances the keys once again).
+ * It's convenient to structure _bt_check_compare rechecks of caller's tuple
+ * (using the new qual) as one of the steps of advancing the scan's array
+ * keys, so this function works as a wrapper around _bt_check_compare.
+ *
+ * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the
+ * caller, and return a boolean indicating if caller's tuple satisfies the
+ * scan's new qual. But unlike _bt_check_compare, we set so->needPrimScan
+ * when we set continuescan=false, indicating if a new primitive index scan
+ * has been scheduled (otherwise, the top-level scan has run out of tuples in
+ * the current scan direction).
+ *
+ * Caller must use _bt_tuple_before_array_skeys to determine if the current
+ * place in the scan is >= the current array keys _before_ calling here.
+ * We're responsible for ensuring that caller's tuple is <= the newly advanced
+ * required array keys once we return. We try to find an exact match, but
+ * failing that we'll advance the array keys to whatever set of array elements
+ * comes next in the key space for the current scan direction. Required array
+ * keys "ratchet forwards" (or backwards). They can only advance as the scan
+ * itself advances through the index/key space.
+ *
+ * (The rules are the same for backwards scans, except that the operators are
+ * flipped: just replace the precondition's >= operator with a <=, and the
+ * postcondition's <= operator with a >=. In other words, just swap the
+ * precondition with the postcondition.)
+ *
+ * We also deal with "advancing" non-required arrays here. Callers whose
+ * sktrig scan key is non-required specify sktrig_required=false. These calls
+ * are the only exception to the general rule about always advancing the
+ * required array keys (the scan may not even have a required array). These
+ * callers should just pass a NULL pstate (since there is never any question
+ * of stopping the scan). No call to _bt_tuple_before_array_skeys is required
+ * ahead of these calls (it's already clear that any required scan keys must
+ * be satisfied by caller's tuple).
+ *
+ * Note that we deal with non-array required equality strategy scan keys as
+ * degenerate single element arrays here. Obviously, they can never really
+ * advance in the way that real arrays can, but they must still affect how we
+ * advance real array scan keys (exactly like true array equality scan keys).
+ * We have to keep around a 3-way ORDER proc for these (using the "=" operator
+ * won't do), since in general whether the tuple is < or > _any_ unsatisfied
+ * required equality key influences how the scan's real arrays must advance.
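+ *
+ * For example (hypothetically), with "WHERE a = ANY('{10, 20, 30}')"
+ * and the array currently on 10, a forward scan tuple with a = 20
+ * advances the array to the exact match 20, a tuple with a = 25
+ * advances it to the next element in the scan direction (30), and a
+ * tuple with a = 35 exhausts the array outright.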
+ * + * Note also that we may sometimes need to advance the array keys when the + * existing required array keys (and other required equality keys) are already + * an exact match for every corresponding value from caller's tuple. We must + * do this for inequalities that _bt_check_compare set continuescan=false for. + * They'll advance the array keys here, just like any other scan key that + * _bt_check_compare stops on. (This can even happen _after_ we advance the + * array keys, in which case we'll advance the array keys a second time. That + * way _bt_checkkeys caller always has its required arrays advance to the + * maximum possible extent that its tuple will allow.) + */ +static bool +_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int sktrig, bool sktrig_required) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = pstate ? pstate->dir : ForwardScanDirection; + int arrayidx = 0; + bool beyond_end_advance = false, + has_required_opposite_direction_only = false, + oppodir_inequality_sktrig = false, + all_required_satisfied = true, + all_satisfied = true; + + if (sktrig_required) + { + /* + * Precondition array state assertion + */ + Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, + tupnatts, false, 0, NULL)); + + so->scanBehind = false; /* reset */ + + /* + * Required scan key wasn't satisfied, so required arrays will have to + * advance. Invalidate page-level state that tracks whether the + * scan's required-in-opposite-direction-only keys are known to be + * satisfied by page's remaining tuples. + */ + pstate->firstmatch = false; + + /* Shouldn't have to invalidate 'prechecked', though */ + Assert(!pstate->prechecked); + + /* + * Once we return we'll have a new set of required array keys, so + * reset state used by "look ahead" optimization + */ + pstate->rechecks = 0; + pstate->targetdistance = 0; + } + + Assert(_bt_verify_keys_with_arraykeys(scan)); + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array = NULL; + Datum tupdatum; + bool required = false, + required_opposite_direction_only = false, + tupnull; + int32 result; + int set_elem = 0; + + if (cur->sk_strategy == BTEqualStrategyNumber) + { + /* Manage array state */ + if (cur->sk_flags & SK_SEARCHARRAY) + { + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == ikey); + } + } + else + { + /* + * Are any inequalities required in the opposite direction only + * present here? + */ + if (((ScanDirectionIsForward(dir) && + (cur->sk_flags & (SK_BT_REQBKWD))) || + (ScanDirectionIsBackward(dir) && + (cur->sk_flags & (SK_BT_REQFWD))))) + has_required_opposite_direction_only = + required_opposite_direction_only = true; + } + + /* Optimization: skip over known-satisfied scan keys */ + if (ikey < sktrig) + continue; + + if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) + { + Assert(sktrig_required); + + required = true; + + if (cur->sk_attno > tupnatts) + { + /* Set this just like _bt_tuple_before_array_skeys */ + Assert(sktrig < ikey); + so->scanBehind = true; + } + } + + /* + * Handle a required non-array scan key that the initial call to + * _bt_check_compare indicated triggered array advancement, if any. + * + * The non-array scan key's strategy will be <, <=, or = during a + * forwards scan (or any one of =, >=, or > during a backwards scan). 
+ * It follows that the corresponding tuple attribute's value must now + * be either > or >= the scan key value (for backwards scans it must + * be either < or <= that value). + * + * If this is a required equality strategy scan key, this is just an + * optimization; _bt_tuple_before_array_skeys already confirmed that + * this scan key places us ahead of caller's tuple. There's no need + * to repeat that work now. (The same underlying principle also gets + * applied by the cur_elem_trig optimization used to speed up searches + * for the next array element.) + * + * If this is a required inequality strategy scan key, we _must_ rely + * on _bt_check_compare like this; we aren't capable of directly + * evaluating required inequality strategy scan keys here, on our own. + */ + if (ikey == sktrig && !array) + { + Assert(sktrig_required && required && all_required_satisfied); + + /* Use "beyond end" advancement. See below for an explanation. */ + beyond_end_advance = true; + all_satisfied = all_required_satisfied = false; + + /* + * Set a flag that remembers that this was an inequality required + * in the opposite scan direction only, that nevertheless + * triggered the call here. + * + * This only happens when an inequality operator (which must be + * strict) encounters a group of NULLs that indicate the end of + * non-NULL values for tuples in the current scan direction. + */ + if (unlikely(required_opposite_direction_only)) + oppodir_inequality_sktrig = true; + + continue; + } + + /* + * Nothing more for us to do with an inequality strategy scan key that + * wasn't the one that _bt_check_compare stopped on, though. + * + * Note: if our later call to _bt_check_compare (to recheck caller's + * tuple) sets continuescan=false due to finding this same inequality + * unsatisfied (possible when it's required in the scan direction), + * we'll deal with it via a recursive "second pass" call. + */ + else if (cur->sk_strategy != BTEqualStrategyNumber) + continue; + + /* + * Nothing for us to do with an equality strategy scan key that isn't + * marked required, either -- unless it's a non-required array + */ + else if (!required && !array) + continue; + + /* + * Here we perform steps for all array scan keys after a required + * array scan key whose binary search triggered "beyond end of array + * element" array advancement due to encountering a tuple attribute + * value > the closest matching array key (or < for backwards scans). + */ + if (beyond_end_advance) + { + int final_elem_dir; + + if (ScanDirectionIsBackward(dir) || !array) + final_elem_dir = 0; + else + final_elem_dir = array->num_elems - 1; + + if (array && array->cur_elem != final_elem_dir) + { + array->cur_elem = final_elem_dir; + cur->sk_argument = array->elem_values[final_elem_dir]; + } + + continue; + } + + /* + * Here we perform steps for all array scan keys after a required + * array scan key whose tuple attribute was < the closest matching + * array key when we dealt with it (or > for backwards scans). + * + * This earlier required array key already puts us ahead of caller's + * tuple in the key space (for the current scan direction). We must + * make sure that subsequent lower-order array keys do not put us too + * far ahead (ahead of tuples that have yet to be seen by our caller). + * For example, when a tuple "(a, b) = (42, 5)" advances the array + * keys on "a" from 40 to 45, we must also set "b" to whatever the + * first array element for "b" is. It would be wrong to allow "b" to + * be set based on the tuple value. 
+ * + * Perform the same steps with truncated high key attributes. You can + * think of this as a "binary search" for the element closest to the + * value -inf. Again, the arrays must never get ahead of the scan. + */ + if (!all_required_satisfied || cur->sk_attno > tupnatts) + { + int first_elem_dir; + + if (ScanDirectionIsForward(dir) || !array) + first_elem_dir = 0; + else + first_elem_dir = array->num_elems - 1; + + if (array && array->cur_elem != first_elem_dir) + { + array->cur_elem = first_elem_dir; + cur->sk_argument = array->elem_values[first_elem_dir]; + } + + continue; + } + + /* + * Search in scankey's array for the corresponding tuple attribute + * value from caller's tuple + */ + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + + if (array) + { + bool cur_elem_trig = (sktrig_required && ikey == sktrig); + + /* + * Binary search for closest match that's available from the array + */ + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); + + Assert(set_elem >= 0 && set_elem < array->num_elems); + } + else + { + Assert(sktrig_required && required); + + /* + * This is a required non-array equality strategy scan key, which + * we'll treat as a degenerate single element array. + * + * This scan key's imaginary "array" can't really advance, but it + * can still roll over like any other array. (Actually, this is + * no different to real single value arrays, which never advance + * without rolling over -- they can never truly advance, either.) + */ + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + } + + /* + * Consider "beyond end of array element" array advancement. + * + * When the tuple attribute value is > the closest matching array key + * (or < in the backwards scan case), we need to ratchet this array + * forward (backward) by one increment, so that caller's tuple ends up + * being < final array value instead (or > final array value instead). + * This process has to work for all of the arrays, not just this one: + * it must "carry" to higher-order arrays when the set_elem that we + * just found happens to be the final one for the scan's direction. + * Incrementing (decrementing) set_elem itself isn't good enough. + * + * Our approach is to provisionally use set_elem as if it was an exact + * match now, then set each later/less significant array to whatever + * its final element is. Once outside the loop we'll then "increment + * this array's set_elem" by calling _bt_advance_array_keys_increment. + * That way the process rolls over to higher order arrays as needed. + * + * Under this scheme any required arrays only ever ratchet forwards + * (or backwards), and always do so to the maximum possible extent + * that we can know will be safe without seeing the scan's next tuple. + * We don't need any special handling for required scan keys that lack + * a real array to advance, nor for redundant scan keys that couldn't + * be eliminated by _bt_preprocess_keys. It won't matter if some of + * our "true" array scan keys (or even all of them) are non-required. 
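+ *
+ * For instance (hypothetically), with arrays a in {10, 20} and b in
+ * {5, 6}, a forward scan tuple "(a, b) = (10, 9)" finds an exact match
+ * for "a", but sees a "b" value beyond that array's final element.
+ * Incrementing rolls "b" over to 5 and carries into "a", leaving the
+ * new array keys at (20, 5) -- safely ahead of caller's tuple.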
+ */ + if (required && + ((ScanDirectionIsForward(dir) && result > 0) || + (ScanDirectionIsBackward(dir) && result < 0))) + beyond_end_advance = true; + + Assert(all_required_satisfied && all_satisfied); + if (result != 0) + { + /* + * Track whether caller's tuple satisfies our new post-advancement + * qual, for required scan keys, as well as for the entire set of + * interesting scan keys (all required scan keys plus non-required + * array scan keys are considered interesting.) + */ + all_satisfied = false; + if (required) + all_required_satisfied = false; + else + { + /* + * There's no need to advance the arrays using the best + * available match for a non-required array. Give up now. + * (Though note that sktrig_required calls still have to do + * all the usual post-advancement steps, including the recheck + * call to _bt_check_compare.) + */ + break; + } + } + + /* Advance array keys, even when set_elem isn't an exact match */ + if (array && array->cur_elem != set_elem) + { + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; } } /* - * If we changed any keys, we must redo _bt_preprocess_keys. That might - * sound like overkill, but in cases with multiple keys per index column - * it seems necessary to do the full set of pushups. - * - * Also do this whenever the scan's set of array keys "wrapped around" at - * the end of the last primitive index scan. There won't have been a call - * to _bt_preprocess_keys from some other place following wrap around, so - * we do it for ourselves. + * Advance the array keys incrementally whenever "beyond end of array + * element" array advancement happens, so that advancement will carry to + * higher-order arrays (might exhaust all the scan's arrays instead, which + * ends the top-level scan). */ - if (changed || !so->arraysStarted) - { - _bt_preprocess_keys(scan); - /* The mark should have been set on a consistent set of keys... */ - Assert(so->qual_ok); - } -} + if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir)) + goto end_toplevel_scan; + Assert(_bt_verify_keys_with_arraykeys(scan)); + + /* + * Does tuple now satisfy our new qual? Recheck with _bt_check_compare. + * + * Calls triggered by an unsatisfied required scan key, whose tuple now + * satisfies all required scan keys, but not all nonrequired array keys, + * will still require a recheck call to _bt_check_compare. They'll still + * need its "second pass" handling of required inequality scan keys. + * (Might have missed a still-unsatisfied required inequality scan key + * that caller didn't detect as the sktrig scan key during its initial + * _bt_check_compare call that used the old/original qual.) + * + * Calls triggered by an unsatisfied nonrequired array scan key never need + * "second pass" handling of required inequalities (nor any other handling + * of any required scan key). All that matters is whether caller's tuple + * satisfies the new qual, so it's safe to just skip the _bt_check_compare + * recheck when we've already determined that it can only return 'false'. 
+ */ + if ((sktrig_required && all_required_satisfied) || + (!sktrig_required && all_satisfied)) + { + int nsktrig = sktrig + 1; + bool continuescan; + + Assert(all_required_satisfied); + + /* Recheck _bt_check_compare on behalf of caller */ + if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, + false, false, false, + &continuescan, &nsktrig) && + !so->scanBehind) + { + /* This tuple satisfies the new qual */ + Assert(all_satisfied && continuescan); + + if (pstate) + pstate->continuescan = true; + + return true; + } + + /* + * Consider "second pass" handling of required inequalities. + * + * It's possible that our _bt_check_compare call indicated that the + * scan should end due to some unsatisfied inequality that wasn't + * initially recognized as such by us. Handle this by calling + * ourselves recursively, this time indicating that the trigger is the + * inequality that we missed first time around (and using a set of + * required array/equality keys that are now exact matches for tuple). + * + * We make a strong, general guarantee that every _bt_checkkeys call + * here will advance the array keys to the maximum possible extent + * that we can know to be safe based on caller's tuple alone. If we + * didn't perform this step, then that guarantee wouldn't quite hold. + */ + if (unlikely(!continuescan)) + { + bool satisfied PG_USED_FOR_ASSERTS_ONLY; + + Assert(sktrig_required); + Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber); + + /* + * The tuple must use "beyond end" advancement during the + * recursive call, so we cannot possibly end up back here when + * recursing. We'll consume a small, fixed amount of stack space. + */ + Assert(!beyond_end_advance); + + /* Advance the array keys a second time using same tuple */ + satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts, + tupdesc, nsktrig, true); + + /* This tuple doesn't satisfy the inequality */ + Assert(!satisfied); + return false; + } + + /* + * Some non-required scan key (from new qual) still not satisfied. + * + * All scan keys required in the current scan direction must still be + * satisfied, though, so we can trust all_required_satisfied below. + */ + } + + /* + * When we were called just to deal with "advancing" non-required arrays, + * this is as far as we can go (cannot stop the scan for these callers) + */ + if (!sktrig_required) + { + /* Caller's tuple doesn't match any qual */ + return false; + } + + /* + * Postcondition array state assertion (for still-unsatisfied tuples). + * + * By here we have established that the scan's required arrays (scan must + * have at least one required array) advanced, without becoming exhausted. + * + * Caller's tuple is now < the newly advanced array keys (or > when this + * is a backwards scan), except in the case where we only got this far due + * to an unsatisfied non-required scan key. Verify that with an assert. + * + * Note: we don't just quit at this point when all required scan keys were + * found to be satisfied because we need to consider edge-cases involving + * scan keys required in the opposite direction only; those aren't tracked + * by all_required_satisfied. (Actually, oppodir_inequality_sktrig trigger + * scan keys are tracked by all_required_satisfied, since it's convenient + * for _bt_check_compare to behave as if they are required in the current + * scan direction to deal with NULLs. We'll account for that separately.) 
+ */ + Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, + false, 0, NULL) == + !all_required_satisfied); + + /* + * We generally permit primitive index scans to continue onto the next + * sibling page when the page's finaltup satisfies all required scan keys + * at the point where we're between pages. + * + * If caller's tuple is also the page's finaltup, and we see that required + * scan keys still aren't satisfied, start a new primitive index scan. + */ + if (!all_required_satisfied && pstate->finaltup == tuple) + goto new_prim_scan; + + /* + * Proactively check finaltup (don't wait until finaltup is reached by the + * scan) when it might well turn out to not be satisfied later on. + * + * Note: if so->scanBehind hasn't already been set for finaltup by us, + * it'll be set during this call to _bt_tuple_before_array_skeys. Either + * way, it'll be set correctly (for the whole page) after this point. + */ + if (!all_required_satisfied && pstate->finaltup && + _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc, + BTreeTupleGetNAtts(pstate->finaltup, rel), + false, 0, &so->scanBehind)) + goto new_prim_scan; + + /* + * When we encounter a truncated finaltup high key attribute, we're + * optimistic about the chances of its corresponding required scan key + * being satisfied when we go on to check it against tuples from this + * page's right sibling leaf page. We consider truncated attributes to be + * satisfied by required scan keys, which allows the primitive index scan + * to continue to the next leaf page. We must set so->scanBehind to true + * to remember that the last page's finaltup had "satisfied" required scan + * keys for one or more truncated attribute values (scan keys required in + * _either_ scan direction). + * + * There is a chance that _bt_checkkeys (which checks so->scanBehind) will + * find that even the sibling leaf page's finaltup is < the new array + * keys. When that happens, our optimistic policy will have incurred a + * single extra leaf page access that could have been avoided. + * + * A pessimistic policy would give backward scans a gratuitous advantage + * over forward scans. We'd punish forward scans for applying more + * accurate information from the high key, rather than just using the + * final non-pivot tuple as finaltup, in the style of backward scans. + * Being pessimistic would also give some scans with non-required arrays a + * perverse advantage over similar scans that use required arrays instead. + * + * You can think of this as a speculative bet on what the scan is likely + * to find on the next page. It's not much of a gamble, though, since the + * untruncated prefix of attributes must strictly satisfy the new qual + * (though it's okay if any non-required scan keys fail to be satisfied). + */ + if (so->scanBehind && has_required_opposite_direction_only) + { + /* + * However, we avoid this behavior whenever the scan involves a scan + * key required in the opposite direction to the scan only, along with + * a finaltup with at least one truncated attribute that's associated + * with a scan key marked required (required in either direction). + * + * _bt_check_compare simply won't stop the scan for a scan key that's + * marked required in the opposite scan direction only. 
That leaves
+ * us without any reliable way of reconsidering any opposite-direction
+ * inequalities if it turns out that starting a new primitive index
+ * scan will allow _bt_first to skip ahead by a great many leaf pages
+ * (see next section for details of how that works).
+ */
+ goto new_prim_scan;
+ }
+
+ /*
+ * Handle inequalities marked required in the opposite scan direction.
+ * They can also signal that we should start a new primitive index scan.
+ *
+ * It's possible that the scan is now positioned where "matching" tuples
+ * begin, and that caller's tuple satisfies all scan keys required in the
+ * current scan direction. But if caller's tuple still doesn't satisfy
+ * other scan keys that are required in the opposite scan direction only
+ * (e.g., a required >= strategy scan key when scan direction is forward),
+ * it's still possible that there are many leaf pages before the page that
+ * _bt_first could skip straight to. Groveling through all those pages
+ * will always give correct answers, but it can be very inefficient. We
+ * must avoid needlessly scanning extra pages.
+ *
+ * Separately, it's possible that _bt_check_compare set continuescan=false
+ * for a scan key that's required in the opposite direction only. This is
+ * a special case, that happens only when _bt_check_compare sees that the
+ * inequality encountered a NULL value. This signals the end of non-NULL
+ * values in the current scan direction, which is reason enough to end the
+ * (primitive) scan. If this happens at the start of a large group of
+ * NULL values, then we shouldn't expect to be called again until after
+ * the scan has already read indefinitely-many leaf pages full of tuples
+ * with NULL suffix values. We need a separate test for this case so that
+ * we don't miss our only opportunity to skip over such a group of pages.
+ * (_bt_first is expected to skip over the group of NULLs by applying a
+ * similar "deduce NOT NULL" rule, where it finishes its insertion scan
+ * key by consing up an explicit SK_SEARCHNOTNULL key.)
+ *
+ * Apply a test against finaltup to detect and recover from these problems:
+ * if even finaltup doesn't satisfy such an inequality, we just skip by
+ * starting a new primitive index scan. When we skip, we know for sure
+ * that all of the tuples on the current page following caller's tuple are
+ * also before the _bt_first-wise start of tuples for our new qual. That
+ * at least suggests many more skippable pages beyond the current page.
+ */
+ if (has_required_opposite_direction_only && pstate->finaltup &&
+ (all_required_satisfied || oppodir_inequality_sktrig))
+ {
+ int nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel);
+ ScanDirection flipped;
+ bool continuescanflip;
+ int opsktrig;
+
+ /*
+ * We're checking finaltup (which is usually not caller's tuple), so
+ * cannot reuse work from caller's earlier _bt_check_compare call.
+ *
+ * Flip the scan direction when calling _bt_check_compare this time,
+ * so that it will set continuescanflip=false when it encounters an
+ * inequality required in the opposite scan direction.
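+ *
+ * For example (a hypothetical forward scan qual), given
+ * "WHERE a = ANY('{1, 100}') AND b >= 3", the ">= 3" key is required in
+ * the backward direction only, so it can never end the forward scan by
+ * itself. If even finaltup fails "b >= 3" under the flipped-direction
+ * check below, every remaining tuple on the page is before the position
+ * that _bt_first could reposition the scan to (the start of "a = 100
+ * AND b >= 3" tuples), so we start a new primitive index scan instead.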
+ */ + Assert(!so->scanBehind); + opsktrig = 0; + flipped = -dir; + _bt_check_compare(scan, flipped, + pstate->finaltup, nfinaltupatts, tupdesc, + false, false, false, + &continuescanflip, &opsktrig); + + /* + * If we ended up here due to the all_required_satisfied criteria, + * test opsktrig in a way that ensures that finaltup contains the same + * prefix of key columns as caller's tuple (a prefix that satisfies + * earlier required-in-current-direction scan keys). + * + * If we ended up here due to the oppodir_inequality_sktrig criteria, + * test opsktrig in a way that ensures that the same scan key that our + * caller found to be unsatisfied (by the scan's tuple) was also the + * one unsatisfied just now (by finaltup). That way we'll only start + * a new primitive scan when we're sure that both tuples _don't_ share + * the same prefix of satisfied equality-constrained attribute values, + * and that finaltup has a non-NULL attribute value indicated by the + * unsatisfied scan key at offset opsktrig/sktrig. (This depends on + * _bt_check_compare not caring about the direction that inequalities + * are required in whenever NULL attribute values are unsatisfied. It + * only cares about the scan direction, and its relationship to + * whether NULLs are stored first or last relative to non-NULLs.) + */ + Assert(all_required_satisfied != oppodir_inequality_sktrig); + if (unlikely(!continuescanflip && + ((all_required_satisfied && opsktrig > sktrig) || + (oppodir_inequality_sktrig && opsktrig >= sktrig)))) + { + Assert(so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber); + + /* + * Make sure that any non-required arrays are set to the first + * array element for the current scan direction + */ + _bt_rewind_nonrequired_arrays(scan, dir); + + goto new_prim_scan; + } + } + + /* + * Stick with the ongoing primitive index scan for now. + * + * It's possible that later tuples will also turn out to have values that + * are still < the now-current array keys (or > the current array keys). + * Our caller will handle this by performing what amounts to a linear + * search of the page, implemented by calling _bt_check_compare and then + * _bt_tuple_before_array_skeys for each tuple. + * + * This approach has various advantages over a binary search of the page. + * Repeated binary searches of the page (one binary search for every array + * advancement) won't outperform a continuous linear search. While there + * are workloads that a naive linear search won't handle well, our caller + * has a "look ahead" fallback mechanism to deal with that problem. + */ + pstate->continuescan = true; /* Override _bt_check_compare */ + so->needPrimScan = false; /* _bt_readpage has more tuples to check */ + + if (so->scanBehind) + { + /* Optimization: skip by setting "look ahead" mechanism's offnum */ + Assert(ScanDirectionIsForward(dir)); + pstate->skip = pstate->maxoff + 1; + } + + /* Caller's tuple doesn't match the new qual */ + return false; + +new_prim_scan: + + /* + * End this primitive index scan, but schedule another. + * + * Note: If the scan direction happens to change, this scheduled primitive + * index scan won't go ahead after all. + */ + pstate->continuescan = false; /* Tell _bt_readpage we're done... 
*/ + so->needPrimScan = true; /* ...but call _bt_first again */ + + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page); + + /* Caller's tuple doesn't match the new qual */ + return false; + +end_toplevel_scan: + + /* + * End the current primitive index scan, but don't schedule another. + * + * This ends the entire top-level scan in the current scan direction. + * + * Note: The scan's arrays (including any non-required arrays) are now in + * their final positions for the current scan direction. If the scan + * direction happens to change, then the arrays will already be in their + * first positions for what will then be the current scan direction. + */ + pstate->continuescan = false; /* Tell _bt_readpage we're done... */ + so->needPrimScan = false; /* ...don't call _bt_first again, though */ + + /* Caller's tuple doesn't match any qual */ + return false; +} /* * _bt_preprocess_keys() -- Preprocess scan keys * - * The given search-type keys (in scan->keyData[] or so->arrayKeyData[]) + * The given search-type keys (taken from scan->keyData[]) * are copied to so->keyData[] with possible transformation. * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets * the number of output keys (possibly less, never greater). @@ -690,8 +2485,9 @@ _bt_restore_array_keys(IndexScanDesc scan) * The output keys must be sorted by index attribute. Presently we expect * (but verify) that the input keys are already so sorted --- this is done * by match_clauses_to_index() in indxpath.c. Some reordering of the keys - * within each attribute may be done as a byproduct of the processing here, - * but no other code depends on that. + * within each attribute may be done as a byproduct of the processing here. + * That process must leave array scan keys (within an attribute) in the same + * order as corresponding entries from the scan's BTArrayKeyInfo array info. * * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD * if they must be satisfied in order to continue the scan forward or backward @@ -748,8 +2544,8 @@ _bt_restore_array_keys(IndexScanDesc scan) * * Note: the reason we have to copy the preprocessed scan keys into private * storage is that we are modifying the array based on comparisons of the - * key argument values, which could change on a rescan or after moving to - * new elements of array keys. Therefore we can't overwrite the source data. + * key argument values, which could change on a rescan. Therefore we can't + * overwrite the source data. */ void _bt_preprocess_keys(IndexScanDesc scan) @@ -762,11 +2558,31 @@ _bt_preprocess_keys(IndexScanDesc scan) ScanKey inkeys; ScanKey outkeys; ScanKey cur; - ScanKey xform[BTMaxStrategyNumber]; + BTScanKeyPreproc xform[BTMaxStrategyNumber]; bool test_result; int i, j; AttrNumber attno; + ScanKey arrayKeyData; + int *keyDataMap = NULL; + int arrayidx = 0; + + /* + * We're called at the start of each primitive index scan during scans + * that use equality array keys. We can just reuse the scan keys that + * were output at the start of the scan's first primitive index scan. + */ + if (so->numberOfKeys > 0) + { + /* + * An earlier call to _bt_advance_array_keys already set everything + * up. Just assert that the scan's existing output scan keys are + * consistent with its current array elements.
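+ * + * (Array advancement keeps each array scan key's sk_argument in agreement + * with the array's current element, which is among the invariants that + * _bt_verify_keys_with_arraykeys checks.)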
+ */ + Assert(so->numArrayKeys); + Assert(_bt_verify_keys_with_arraykeys(scan)); + return; + } /* initialize result variables */ so->qual_ok = true; @@ -775,11 +2591,27 @@ _bt_preprocess_keys(IndexScanDesc scan) if (numberOfKeys < 1) return; /* done if qual-less scan */ + /* If any keys are SK_SEARCHARRAY type, set up array-key info */ + arrayKeyData = _bt_preprocess_array_keys(scan); + if (!so->qual_ok) + { + /* unmatchable array, so give up */ + return; + } + /* - * Read so->arrayKeyData if array keys are present, else scan->keyData + * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[]) + * as our input if _bt_preprocess_array_keys just allocated it, else just + * use scan->keyData[] */ - if (so->arrayKeyData != NULL) - inkeys = so->arrayKeyData; + if (arrayKeyData) + { + inkeys = arrayKeyData; + + /* Also maintain keyDataMap for remapping so->orderProc[] later */ + keyDataMap = MemoryContextAlloc(so->arrayContext, + numberOfKeys * sizeof(int)); + } else inkeys = scan->keyData; @@ -800,6 +2632,19 @@ _bt_preprocess_keys(IndexScanDesc scan) /* We can mark the qual as required if it's for first index col */ if (cur->sk_attno == 1) _bt_mark_scankey_required(outkeys); + if (arrayKeyData) + { + /* + * Don't call _bt_preprocess_array_keys_final in this fast path + * (we'll miss out on the single value array transformation, but + * that's not nearly as important when there's only one scan key) + */ + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_strategy != BTEqualStrategyNumber || + (so->arrayKeys[0].scan_key == 0 && + OidIsValid(so->orderProcs[0].fn_oid))); + } + return; } @@ -859,13 +2704,29 @@ _bt_preprocess_keys(IndexScanDesc scan) * check, and we've rejected any combination of it with a regular * equality condition; but not with other types of conditions. 
*/ - if (xform[BTEqualStrategyNumber - 1]) + if (xform[BTEqualStrategyNumber - 1].skey) { - ScanKey eq = xform[BTEqualStrategyNumber - 1]; + ScanKey eq = xform[BTEqualStrategyNumber - 1].skey; + BTArrayKeyInfo *array = NULL; + FmgrInfo *orderproc = NULL; + + if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY)) + { + int eq_in_ikey, + eq_arrayidx; + + eq_in_ikey = xform[BTEqualStrategyNumber - 1].ikey; + eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx; + array = &so->arrayKeys[eq_arrayidx - 1]; + orderproc = so->orderProcs + eq_in_ikey; + + Assert(array->scan_key == eq_in_ikey); + Assert(OidIsValid(orderproc->fn_oid)); + } for (j = BTMaxStrategyNumber; --j >= 0;) { - ScanKey chk = xform[j]; + ScanKey chk = xform[j].skey; if (!chk || j == (BTEqualStrategyNumber - 1)) continue; @@ -878,6 +2739,7 @@ _bt_preprocess_keys(IndexScanDesc scan) } if (_bt_compare_scankey_args(scan, chk, eq, chk, + array, orderproc, &test_result)) { if (!test_result) @@ -887,7 +2749,9 @@ _bt_preprocess_keys(IndexScanDesc scan) return; } /* else discard the redundant non-equality key */ - xform[j] = NULL; + Assert(!array || array->num_elems > 0); + xform[j].skey = NULL; + xform[j].ikey = -1; } /* else, cannot determine redundancy, keep both keys */ } @@ -896,36 +2760,36 @@ _bt_preprocess_keys(IndexScanDesc scan) } /* try to keep only one of <, <= */ - if (xform[BTLessStrategyNumber - 1] - && xform[BTLessEqualStrategyNumber - 1]) + if (xform[BTLessStrategyNumber - 1].skey + && xform[BTLessEqualStrategyNumber - 1].skey) { - ScanKey lt = xform[BTLessStrategyNumber - 1]; - ScanKey le = xform[BTLessEqualStrategyNumber - 1]; + ScanKey lt = xform[BTLessStrategyNumber - 1].skey; + ScanKey le = xform[BTLessEqualStrategyNumber - 1].skey; - if (_bt_compare_scankey_args(scan, le, lt, le, + if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL, &test_result)) { if (test_result) - xform[BTLessEqualStrategyNumber - 1] = NULL; + xform[BTLessEqualStrategyNumber - 1].skey = NULL; else - xform[BTLessStrategyNumber - 1] = NULL; + xform[BTLessStrategyNumber - 1].skey = NULL; } } /* try to keep only one of >, >= */ - if (xform[BTGreaterStrategyNumber - 1] - && xform[BTGreaterEqualStrategyNumber - 1]) + if (xform[BTGreaterStrategyNumber - 1].skey + && xform[BTGreaterEqualStrategyNumber - 1].skey) { - ScanKey gt = xform[BTGreaterStrategyNumber - 1]; - ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1]; + ScanKey gt = xform[BTGreaterStrategyNumber - 1].skey; + ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1].skey; - if (_bt_compare_scankey_args(scan, ge, gt, ge, + if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL, &test_result)) { if (test_result) - xform[BTGreaterEqualStrategyNumber - 1] = NULL; + xform[BTGreaterEqualStrategyNumber - 1].skey = NULL; else - xform[BTGreaterStrategyNumber - 1] = NULL; + xform[BTGreaterStrategyNumber - 1].skey = NULL; } } @@ -936,11 +2800,13 @@ _bt_preprocess_keys(IndexScanDesc scan) */ for (j = BTMaxStrategyNumber; --j >= 0;) { - if (xform[j]) + if (xform[j].skey) { ScanKey outkey = &outkeys[new_numberOfKeys++]; - memcpy(outkey, xform[j], sizeof(ScanKeyData)); + memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); + if (arrayKeyData) + keyDataMap[new_numberOfKeys - 1] = xform[j].ikey; if (priorNumberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); } @@ -966,6 +2832,8 @@ _bt_preprocess_keys(IndexScanDesc scan) ScanKey outkey = &outkeys[new_numberOfKeys++]; memcpy(outkey, cur, sizeof(ScanKeyData)); + if (arrayKeyData) + keyDataMap[new_numberOfKeys - 1] = i; if 
(numberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); @@ -977,20 +2845,112 @@ _bt_preprocess_keys(IndexScanDesc scan) continue; } - /* have we seen one of these before? */ - if (xform[j] == NULL) + /* + * Does this input scan key require further processing as an array? + */ + if (cur->sk_strategy == InvalidStrategy) { - /* nope, so remember this scankey */ - xform[j] = cur; + /* _bt_preprocess_array_keys marked this array key redundant */ + Assert(arrayKeyData); + Assert(cur->sk_flags & SK_SEARCHARRAY); + continue; + } + + if (cur->sk_strategy == BTEqualStrategyNumber && + (cur->sk_flags & SK_SEARCHARRAY)) + { + /* _bt_preprocess_array_keys kept this array key */ + Assert(arrayKeyData); + arrayidx++; + } + + /* + * have we seen a scan key for this same attribute and using this same + * operator strategy before now? + */ + if (xform[j].skey == NULL) + { + /* nope, so this scan key wins by default (at least for now) */ + xform[j].skey = cur; + xform[j].ikey = i; + xform[j].arrayidx = arrayidx; } else { - /* yup, keep only the more restrictive key */ - if (_bt_compare_scankey_args(scan, cur, cur, xform[j], - &test_result)) + FmgrInfo *orderproc = NULL; + BTArrayKeyInfo *array = NULL; + + /* + * Seen one of these before, so keep only the more restrictive key + * if possible + */ + if (j == (BTEqualStrategyNumber - 1) && arrayKeyData) { + /* + * Have to set up array keys + */ + if ((cur->sk_flags & SK_SEARCHARRAY)) + { + array = &so->arrayKeys[arrayidx - 1]; + orderproc = so->orderProcs + i; + + Assert(array->scan_key == i); + Assert(OidIsValid(orderproc->fn_oid)); + } + else if ((xform[j].skey->sk_flags & SK_SEARCHARRAY)) + { + array = &so->arrayKeys[xform[j].arrayidx - 1]; + orderproc = so->orderProcs + xform[j].ikey; + + Assert(array->scan_key == xform[j].ikey); + Assert(OidIsValid(orderproc->fn_oid)); + } + + /* + * Both scan keys might have arrays, in which case we'll + * arbitrarily pass only one of the arrays. That won't + * matter, since _bt_compare_scankey_args is aware that two + * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys + * failed to eliminate redundant arrays through array merging. + * _bt_compare_scankey_args just returns false when it sees + * this; it won't even try to examine either array. + */ + } + + if (_bt_compare_scankey_args(scan, cur, cur, xform[j].skey, + array, orderproc, &test_result)) + { + /* Have all we need to determine redundancy */ if (test_result) - xform[j] = cur; + { + Assert(!array || array->num_elems > 0); + + /* + * New key is more restrictive, and so replaces old key... + */ + if (j != (BTEqualStrategyNumber - 1) || + !(xform[j].skey->sk_flags & SK_SEARCHARRAY)) + { + Assert(!array || array->scan_key == i); + xform[j].skey = cur; + xform[j].ikey = i; + xform[j].arrayidx = arrayidx; + } + else + { + /* + * ...unless we have to keep the old key because it's + * an array that rendered the new key redundant. We + * need to make sure that we don't throw away an array + * scan key. _bt_compare_scankey_args expects us to + * always keep arrays (and discard non-arrays). + */ + Assert(j == (BTEqualStrategyNumber - 1)); + Assert(xform[j].skey->sk_flags & SK_SEARCHARRAY); + Assert(xform[j].ikey == array->scan_key); + Assert(!(cur->sk_flags & SK_SEARCHARRAY)); + } + } else if (j == (BTEqualStrategyNumber - 1)) { /* key == a && key == b, but a != b */ @@ -1002,22 +2962,130 @@ _bt_preprocess_keys(IndexScanDesc scan) else { /* - * We can't determine which key is more restrictive. 
Keep the - * previous one in xform[j] and push this one directly to the - * output array. + * We can't determine which key is more restrictive. Push + * xform[j] directly to the output array, then set xform[j] to + * the new scan key. + * + * Note: We do things this way around so that our arrays are + * always in the same order as their corresponding scan keys, + * even with incomplete opfamilies. _bt_advance_array_keys + * depends on this. */ ScanKey outkey = &outkeys[new_numberOfKeys++]; - memcpy(outkey, cur, sizeof(ScanKeyData)); + memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); + if (arrayKeyData) + keyDataMap[new_numberOfKeys - 1] = xform[j].ikey; if (numberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); + xform[j].skey = cur; + xform[j].ikey = i; + xform[j].arrayidx = arrayidx; } } } so->numberOfKeys = new_numberOfKeys; + + /* + * Now that we've built a temporary mapping from so->keyData[] (output + * scan keys) to scan->keyData[] (input scan keys), fix array->scan_key + * references. Also consolidate the so->orderProc[] array such that it + * can be subscripted using so->keyData[]-wise offsets. + */ + if (arrayKeyData) + _bt_preprocess_array_keys_final(scan, keyDataMap); + + /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */ } +#ifdef USE_ASSERT_CHECKING +/* + * Verify that the scan's qual state matches what we expect at the point that + * _bt_start_prim_scan is about to start a just-scheduled new primitive scan. + * + * We enforce a rule against non-required array scan keys: they must start out + * with whatever element is the first for the scan's current scan direction. + * See _bt_rewind_nonrequired_arrays comments for an explanation. + */ +static bool +_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int arrayidx = 0; + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array = NULL; + int first_elem_dir; + + if (!(cur->sk_flags & SK_SEARCHARRAY) || + cur->sk_strategy != BTEqualStrategyNumber) + continue; + + array = &so->arrayKeys[arrayidx++]; + + if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || + ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) + continue; + + if (ScanDirectionIsForward(dir)) + first_elem_dir = 0; + else + first_elem_dir = array->num_elems - 1; + + if (array->cur_elem != first_elem_dir) + return false; + } + + return _bt_verify_keys_with_arraykeys(scan); +} + +/* + * Verify that the scan's "so->keyData[]" scan keys are in agreement with + * its array key state + */ +static bool +_bt_verify_keys_with_arraykeys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int last_sk_attno = InvalidAttrNumber, + arrayidx = 0; + + if (!so->qual_ok) + return false; + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array; + + if (cur->sk_strategy != BTEqualStrategyNumber || + !(cur->sk_flags & SK_SEARCHARRAY)) + continue; + + array = &so->arrayKeys[arrayidx++]; + if (array->scan_key != ikey) + return false; + + if (array->num_elems <= 0) + return false; + + if (cur->sk_argument != array->elem_values[array->cur_elem]) + return false; + if (last_sk_attno > cur->sk_attno) + return false; + last_sk_attno = cur->sk_attno; + } + + if (arrayidx != so->numArrayKeys) + return false; + + return true; +} +#endif + /* * Compare two scankey values using a specified operator. 
* @@ -1033,9 +3101,24 @@ _bt_preprocess_keys(IndexScanDesc scan) * we store the operator result in *result and return true. We return false * if the comparison could not be made. * + * If either leftarg or rightarg are an array, we'll apply array-specific + * rules to determine which array elements are redundant on behalf of caller. + * It is up to our caller to save whichever of the two scan keys is the array, + * and discard the non-array scan key (the non-array scan key is guaranteed to + * be redundant with any complete opfamily). Caller isn't expected to call + * here with a pair of array scan keys provided we're dealing with a complete + * opfamily (_bt_preprocess_array_keys will merge array keys together to make + * sure of that). + * + * Note: we'll also shrink caller's array as needed to eliminate redundant + * array elements. One reason why caller should prefer to discard non-array + * scan keys is so that we'll have the opportunity to shrink the array + * multiple times, in multiple calls (for each of several other scan keys on + * the same index attribute). + * * Note: op always points at the same ScanKey as either leftarg or rightarg. - * Since we don't scribble on the scankeys, this aliasing should cause no - * trouble. + * Since we don't scribble on the scankeys themselves, this aliasing should + * cause no trouble. * * Note: this routine needs to be insensitive to any DESC option applied * to the index column. For example, "x < 4" is a tighter constraint than @@ -1044,6 +3127,7 @@ _bt_preprocess_keys(IndexScanDesc scan) static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, + BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result) { Relation rel = scan->indexRelation; @@ -1112,6 +3196,48 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, return true; } + /* + * If either leftarg or rightarg are equality-type array scankeys, we need + * specialized handling (since by now we know that IS NULL wasn't used) + */ + if (array) + { + bool leftarray, + rightarray; + + leftarray = ((leftarg->sk_flags & SK_SEARCHARRAY) && + leftarg->sk_strategy == BTEqualStrategyNumber); + rightarray = ((rightarg->sk_flags & SK_SEARCHARRAY) && + rightarg->sk_strategy == BTEqualStrategyNumber); + + /* + * _bt_preprocess_array_keys is responsible for merging together array + * scan keys, and will do so whenever the opfamily has the required + * cross-type support. If it failed to do that, we handle it just + * like the case where we can't make the comparison ourselves. + */ + if (leftarray && rightarray) + { + /* Can't make the comparison */ + *result = false; /* suppress compiler warnings */ + return false; + } + + /* + * Otherwise we need to determine if either one of leftarg or rightarg + * uses an array, then pass this through to a dedicated helper + * function. + */ + if (leftarray) + return _bt_compare_array_scankey_args(scan, leftarg, rightarg, + orderproc, array, result); + else if (rightarray) + return _bt_compare_array_scankey_args(scan, rightarg, leftarg, + orderproc, array, result); + + /* FALL THRU */ + } + /* * The opfamily we need to worry about is identified by the index column. */ @@ -1351,60 +3477,234 @@ _bt_mark_scankey_required(ScanKey skey) * * Return true if so, false if not. If the tuple fails to pass the qual, * we also determine whether there's any need to continue the scan beyond - * this tuple, and set *continuescan accordingly. See comments for + * this tuple, and set pstate.continuescan accordingly. 
See comments for * _bt_preprocess_keys(), above, about how this is done. * * Forward scan callers can pass a high key tuple in the hopes of having * us set *continuescan to false, and avoiding an unnecessary visit to * the page to the right. * + * Advances the scan's array keys when necessary for arrayKeys=true callers. + * Caller can avoid all array related side-effects when calling just to do a + * page continuescan precheck -- pass arrayKeys=false for that. Scans without + * any array keys must always pass arrayKeys=false. + * + * Also stops and starts primitive index scans for arrayKeys=true callers. + * Scans with array keys are required to set up page state that helps us with + * this. The page's finaltup tuple (the page high key for a forward scan, or + * the page's first non-pivot tuple for a backward scan) must be set in + * pstate.finaltup ahead of the first call here for the page (or possibly the + * first call after an initial continuescan-setting page precheck call). Set + * this to NULL for rightmost page (or the leftmost page for backwards scans). + * * scan: index scan descriptor (containing a search-type scankey) + * pstate: page level input and output parameters + * arrayKeys: should we advance the scan's array keys if necessary? * tuple: index tuple to test * tupnatts: number of attributes in tupnatts (high key may be truncated) - * dir: direction we are scanning in - * continuescan: output parameter (will be set correctly in all cases) - * continuescanPrechecked: indicates that *continuescan flag is known to - * be true for the last item on the page - * haveFirstMatch: indicates that we already have at least one match - * in the current page */ bool -_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, - ScanDirection dir, bool *continuescan, - bool continuescanPrechecked, bool haveFirstMatch) +_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, + IndexTuple tuple, int tupnatts) { - TupleDesc tupdesc; - BTScanOpaque so; - int keysz; - int ikey; - ScanKey key; + TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ScanDirection dir = pstate->dir; + int ikey = 0; + bool res; Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); - *continuescan = true; /* default assumption */ + res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, + arrayKeys, pstate->prechecked, pstate->firstmatch, + &pstate->continuescan, &ikey); - tupdesc = RelationGetDescr(scan->indexRelation); - so = (BTScanOpaque) scan->opaque; - keysz = so->numberOfKeys; - - for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) +#ifdef USE_ASSERT_CHECKING + if (!arrayKeys && so->numArrayKeys) { - Datum datum; - bool isNull; - Datum test; - bool requiredSameDir = false, - requiredOppositeDir = false; + /* + * This is a continuescan precheck call for a scan with array keys. + * + * Assert that the scan isn't in danger of becoming confused. + */ + Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch); + Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, + tupnatts, false, 0, NULL)); + } + if (pstate->prechecked || pstate->firstmatch) + { + bool dcontinuescan; + int dikey = 0; /* - * Check if the key is required for ordered scan in the same or - * opposite direction. Save as flag variables for future usage.
+ * Call relied on continuescan/firstmatch prechecks -- assert that we + * get the same answer without those optimizations + */ + Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, + false, false, false, + &dcontinuescan, &dikey)); + Assert(pstate->continuescan == dcontinuescan); + } +#endif + + /* + * Only one _bt_check_compare call is required in the common case where + * there are no equality strategy array scan keys. Otherwise we can only + * accept _bt_check_compare's answer unreservedly when it didn't set + * pstate.continuescan=false. + */ + if (!arrayKeys || pstate->continuescan) + return res; + + /* + * _bt_check_compare call set continuescan=false in the presence of + * equality type array keys. This could mean that the tuple is just past + * the end of matches for the current array keys. + * + * It's also possible that the scan is still _before_ the _start_ of + * tuples matching the current set of array keys. Check for that first. + */ + if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, + ikey, NULL)) + { + /* + * Tuple is still before the start of matches according to the scan's + * required array keys (according to _all_ of its required equality + * strategy keys, actually). + * + * _bt_advance_array_keys occasionally sets so->scanBehind to signal + * that the scan's current position/tuples might be significantly + * behind (multiple pages behind) its current array keys. When this + * happens, we need to be prepared to recover by starting a new + * primitive index scan here, on our own. + */ + Assert(!so->scanBehind || + so->keyData[ikey].sk_strategy == BTEqualStrategyNumber); + if (unlikely(so->scanBehind) && pstate->finaltup && + _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc, + BTreeTupleGetNAtts(pstate->finaltup, + scan->indexRelation), + false, 0, NULL)) + { + /* Cut our losses -- start a new primitive index scan now */ + pstate->continuescan = false; + so->needPrimScan = true; + } + else + { + /* Override _bt_check_compare, continue primitive scan */ + pstate->continuescan = true; + + /* + * We will end up here repeatedly given a group of tuples > the + * previous array keys and < the now-current keys (for a backwards + * scan it's just the same, though the operators swap positions). + * + * We must avoid allowing this linear search process to scan very + * many tuples from well before the start of tuples matching the + * current array keys (or from well before the point where we'll + * once again have to advance the scan's array keys). + * + * We keep the overhead under control by speculatively "looking + * ahead" to later still-unscanned items from this same leaf page. + * We'll only attempt this once the number of tuples that the + * linear search process has examined starts to get out of hand. + */ + pstate->rechecks++; + if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS) + { + /* See if we should skip ahead within the current leaf page */ + _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc); + + /* + * Might have set pstate.skip to a later page offset. When + * that happens then _bt_readpage caller will inexpensively + * skip ahead to a later tuple from the same page (the one + * just after the tuple we successfully "looked ahead" to). + */ + } + } + + /* This indextuple doesn't match the current qual, in any case */ + return false; + } + + /* + * Caller's tuple is >= the current set of array keys and other equality + * constraint scan keys (or <= if this is a backwards scan). 
It's now + * clear that we _must_ advance any required array keys in lockstep with + * the scan. + */ + return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc, + ikey, true); +} + +/* + * Test whether an indextuple satisfies current scan condition. + * + * Return true if so, false if not. If not, also sets *continuescan to false + * when it's also not possible for any later tuples to pass the current qual + * (with the scan's current set of array keys, in the current scan direction), + * in addition to setting *ikey to the so->keyData[] subscript/offset for the + * unsatisfied scan key (needed when caller must consider advancing the scan's + * array keys). + * + * This is a subroutine for _bt_checkkeys. We provisionally assume that + * reaching the end of the current set of required keys (in particular the + * current required array keys) ends the ongoing (primitive) index scan. + * Callers without array keys should just end the scan right away when they + * find that continuescan has been set to false here by us. Things are more + * complicated for callers with array keys. + * + * Callers with array keys must first consider advancing the arrays when + * continuescan has been set to false here by us. They must then consider if + * it really does make sense to end the current (primitive) index scan, in + * light of everything that is known at that point. (In general when we set + * continuescan=false for these callers it must be treated as provisional.) + * + * We deal with advancing unsatisfied non-required arrays directly, though. + * This is safe, since by definition non-required keys can't end the scan. + * This is just how we determine if non-required arrays are just unsatisfied + * by the current array key, or if they're truly unsatisfied (that is, if + * they're unsatisfied by every possible array key). + * + * Though we advance non-required array keys on our own, that shouldn't have + * any lasting consequences for the scan. By definition, non-required arrays + * have no fixed relationship with the scan's progress. (There are delicate + * considerations for non-required arrays when the arrays need to be advanced + * following our setting continuescan to false, but that doesn't concern us.) + * + * Pass advancenonrequired=false to avoid all array related side effects. + * This allows _bt_advance_array_keys caller to avoid infinite recursion. 
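+ * + * (The recursion hazard arises because we handle unsatisfied non-required + * arrays by calling _bt_advance_array_keys, which itself calls back here, + * passing advancenonrequired=false per the above.)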
+ */ +static bool +_bt_check_compare(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool advancenonrequired, bool prechecked, bool firstmatch, + bool *continuescan, int *ikey) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + *continuescan = true; /* default assumption */ + + for (; *ikey < so->numberOfKeys; (*ikey)++) + { + ScanKey key = so->keyData + *ikey; + Datum datum; + bool isNull; + bool requiredSameDir = false, + requiredOppositeDirOnly = false; + + /* + * Check if the key is required in the current scan direction, in the + * opposite scan direction _only_, or in neither direction */ if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) requiredSameDir = true; else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) - requiredOppositeDir = true; + requiredOppositeDirOnly = true; /* * If the caller told us the *continuescan flag is known to be true @@ -1422,8 +3722,9 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * Both cases above work except for the row keys, where NULLs could be * found in the middle of matching values. */ - if ((requiredSameDir || (requiredOppositeDir && haveFirstMatch)) && - !(key->sk_flags & SK_ROW_HEADER) && continuescanPrechecked) + if (prechecked && + (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) && + !(key->sk_flags & SK_ROW_HEADER)) continue; if (key->sk_attno > tupnatts) @@ -1434,7 +3735,6 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * right could be any possible value. Assume that truncated * attribute passes the qual. */ - Assert(ScanDirectionIsForward(dir)); Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1495,6 +3795,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a forward scan, however, we must keep going, because we may * have initially positioned to the start of the index. + * (_bt_advance_array_keys also relies on this behavior during + * forward scans.) */ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsBackward(dir)) @@ -1511,6 +3813,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a backward scan, however, we must keep going, because we * may have initially positioned to the end of the index. + * (_bt_advance_array_keys also relies on this behavior during + * backward scans.) */ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsForward(dir)) @@ -1524,24 +3828,15 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, } /* - * Apply the key-checking function. When the key is required for the - * opposite direction scan, it must be already satisfied as soon as - * there is already match on the page. Except for the NULLs checking, - * which have already done above. + * Apply the key-checking function, though only if we must. + * + * When a key is required in the opposite-of-scan direction _only_, + * then it must already be satisfied if firstmatch=true indicates that + * an earlier tuple from this same page satisfied it earlier on. 
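+ * + * (For example, during a forward scan an "attr >= 5" key is required in + * the backward direction only: the first tuple on the page that satisfies + * it can only be followed by tuples that also satisfy it.)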
*/ - if (!(requiredOppositeDir && haveFirstMatch)) - { - test = FunctionCall2Coll(&key->sk_func, key->sk_collation, - datum, key->sk_argument); - } - else - { - test = true; - Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation, - datum, key->sk_argument)); - } - - if (!DatumGetBool(test)) + if (!(requiredOppositeDirOnly && firstmatch) && + !DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument))) { /* * Tuple fails this qual. If it's a required qual for the current @@ -1557,7 +3852,19 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, *continuescan = false; /* - * In any case, this indextuple doesn't match the qual. + * If this is a non-required equality-type array key, the tuple + * needs to be checked against every possible array key. Handle + * this by "advancing" the scan key's array to a matching value + * (if we're successful then the tuple might match the qual). + */ + else if (advancenonrequired && + key->sk_strategy == BTEqualStrategyNumber && + (key->sk_flags & SK_SEARCHARRAY)) + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); + + /* + * This indextuple doesn't match the qual. */ return false; } @@ -1574,7 +3881,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * it's not possible for any future tuples in the current scan direction * to pass the qual. * - * This is a subroutine for _bt_checkkeys, which see for more info. + * This is a subroutine for _bt_checkkeys/_bt_check_compare. */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, @@ -1603,7 +3910,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * right could be any possible value. Assume that truncated * attribute passes the qual. */ - Assert(ScanDirectionIsForward(dir)); Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) @@ -1630,6 +3936,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a forward scan, however, we must keep going, because we may * have initially positioned to the start of the index. + * (_bt_advance_array_keys also relies on this behavior during + * forward scans.) */ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsBackward(dir)) @@ -1646,6 +3954,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a backward scan, however, we must keep going, because we * may have initially positioned to the end of the index. + * (_bt_advance_array_keys also relies on this behavior during + * backward scans.) */ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsForward(dir)) @@ -1741,6 +4051,90 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, return result; } +/* + * Determine if a scan with array keys should skip over uninteresting tuples. + * + * This is a subroutine for _bt_checkkeys. Called when _bt_readpage's linear + * search process (started after it finishes reading an initial group of + * matching tuples, used to locate the start of the next group of tuples + * matching the next set of required array keys) has already scanned an + * excessive number of tuples whose key space is "between arrays". 
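+ * + * ("Between arrays" tuples are those > the scan's previous set of array + * keys, but still < its now-current array keys; during a backwards scan + * the operators simply swap positions.)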
+ * + * When we perform look ahead successfully, we'll set pstate.skip, which + * instructs _bt_readpage to skip ahead to that tuple next (could be past the + * end of the scan's leaf page). Scans of pages where the optimization is + * effective will generally still need to skip several times. Each call here + * performs only a single "look ahead" comparison of a later tuple, whose + * distance from the current tuple's offset number is determined by applying + * heuristics. + */ +static void +_bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, + int tupnatts, TupleDesc tupdesc) +{ + ScanDirection dir = pstate->dir; + OffsetNumber aheadoffnum; + IndexTuple ahead; + + /* Avoid looking ahead when comparing the page high key */ + if (pstate->offnum < pstate->minoff) + return; + + /* + * Don't look ahead when there aren't enough tuples remaining on the page + * (in the current scan direction) for it to be worth our while + */ + if (ScanDirectionIsForward(dir) && + pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE) + return; + else if (ScanDirectionIsBackward(dir) && + pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE) + return; + + /* + * The look ahead distance starts small, and ramps up as each call here + * allows _bt_readpage to skip over more tuples + */ + if (!pstate->targetdistance) + pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE; + else + pstate->targetdistance *= 2; + + /* Don't read past the end (or before the start) of the page, though */ + if (ScanDirectionIsForward(dir)) + aheadoffnum = Min((int) pstate->maxoff, + (int) pstate->offnum + pstate->targetdistance); + else + aheadoffnum = Max((int) pstate->minoff, + (int) pstate->offnum - pstate->targetdistance); + + ahead = (IndexTuple) PageGetItem(pstate->page, + PageGetItemId(pstate->page, aheadoffnum)); + if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts, + false, 0, NULL)) + { + /* + * Success -- instruct _bt_readpage to skip ahead to very next tuple + * after the one we determined was still before the current array keys + */ + if (ScanDirectionIsForward(dir)) + pstate->skip = aheadoffnum + 1; + else + pstate->skip = aheadoffnum - 1; + } + else + { + /* + * Failure -- "ahead" tuple is too far ahead (we were too aggressive). + * + * Reset the number of rechecks, and aggressively reduce the target + * distance (we're much more aggressive here than we were when the + * distance was initially ramped up).
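+ * + * (Ramp-up doubles targetdistance with each successful call; here we + * divide it by 8, and require a fresh run of rechecks before we'll + * attempt to look ahead again.)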
+ */ + pstate->rechecks = 0; + pstate->targetdistance = Max(pstate->targetdistance / 8, 1); + } +} + /* * _bt_killitems - set LP_DEAD state for items an indexscan caller has * told us were killed diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 9e35aaf56e..fcf6d1d932 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -628,6 +628,8 @@ ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, EState *estate = node->ss.ps.state; node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, estate->es_snapshot); shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen); shm_toc_estimate_keys(&pcxt->estimator, 1); diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 2a3264599d..8000feff4c 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -1644,6 +1644,8 @@ ExecIndexScanEstimate(IndexScanState *node, EState *estate = node->ss.ps.state; node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, estate->es_snapshot); shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen); shm_toc_estimate_keys(&pcxt->estimator, 1); diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 32c6a8bbdc..2230b13104 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -106,8 +106,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop); + bool *skip_nonnative_saop); static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *other_clauses); static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, @@ -706,8 +705,6 @@ eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids, * index AM supports them natively, we should just include them in simple * index paths. If not, we should exclude them while building simple index * paths, and then make a separate attempt to include them in bitmap paths. - * Furthermore, we should consider excluding lower-order ScalarArrayOpExpr - * quals so as to create ordered paths. */ static void get_index_paths(PlannerInfo *root, RelOptInfo *rel, @@ -716,37 +713,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { List *indexpaths; bool skip_nonnative_saop = false; - bool skip_lower_saop = false; ListCell *lc; /* * Build simple index paths using the clauses. Allow ScalarArrayOpExpr - * clauses only if the index AM supports them natively, and skip any such - * clauses for index columns after the first (so that we produce ordered - * paths if possible). + * clauses only if the index AM supports them natively. */ indexpaths = build_index_paths(root, rel, index, clauses, index->predOK, ST_ANYSCAN, - &skip_nonnative_saop, - &skip_lower_saop); - - /* - * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM - * that supports them, then try again including those clauses. This will - * produce paths with more selectivity but no ordering. 
- */ - if (skip_lower_saop) - { - indexpaths = list_concat(indexpaths, - build_index_paths(root, rel, - index, clauses, - index->predOK, - ST_ANYSCAN, - &skip_nonnative_saop, - NULL)); - } + &skip_nonnative_saop); /* * Submit all the ones that can form plain IndexScan plans to add_path. (A @@ -784,7 +761,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, false, ST_BITMAPSCAN, - NULL, NULL); *bitindexpaths = list_concat(*bitindexpaths, indexpaths); } @@ -817,27 +793,19 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, * to true if we found any such clauses (caller must initialize the variable * to false). If it's NULL, we do not ignore ScalarArrayOpExpr clauses. * - * If skip_lower_saop is non-NULL, we ignore ScalarArrayOpExpr clauses for - * non-first index columns, and we set *skip_lower_saop to true if we found - * any such clauses (caller must initialize the variable to false). If it's - * NULL, we do not ignore non-first ScalarArrayOpExpr clauses, but they will - * result in considering the scan's output to be unordered. - * * 'rel' is the index's heap relation * 'index' is the index for which we want to generate paths * 'clauses' is the collection of indexable clauses (IndexClause nodes) * 'useful_predicate' indicates whether the index has a useful predicate * 'scantype' indicates whether we need plain or bitmap scan support * 'skip_nonnative_saop' indicates whether to accept SAOP if index AM doesn't - * 'skip_lower_saop' indicates whether to accept non-first-column SAOP */ static List * build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop) + bool *skip_nonnative_saop) { List *result = NIL; IndexPath *ipath; @@ -848,12 +816,13 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, List *orderbyclausecols; List *index_pathkeys; List *useful_pathkeys; - bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; int indexcol; + Assert(skip_nonnative_saop != NULL || scantype == ST_BITMAPSCAN); + /* * Check that index supports the desired scan type(s) */ @@ -880,19 +849,11 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * on by btree and possibly other places.) The list can be empty, if the * index AM allows that. * - * found_lower_saop_clause is set true if we accept a ScalarArrayOpExpr - * index clause for a non-first index column. This prevents us from - * assuming that the scan result is ordered. (Actually, the result is - * still ordered if there are equality constraints for all earlier - * columns, but it seems too expensive and non-modular for this code to be - * aware of that refinement.) - * * We also build a Relids set showing which outer rels are required by the * selected clauses. Any lateral_relids are included in that, but not * otherwise accounted for. 
*/ index_clauses = NIL; - found_lower_saop_clause = false; outer_relids = bms_copy(rel->lateral_relids); for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++) { @@ -903,30 +864,18 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexClause *iclause = (IndexClause *) lfirst(lc); RestrictInfo *rinfo = iclause->rinfo; - /* We might need to omit ScalarArrayOpExpr clauses */ - if (IsA(rinfo->clause, ScalarArrayOpExpr)) + if (skip_nonnative_saop && !index->amsearcharray && + IsA(rinfo->clause, ScalarArrayOpExpr)) { - if (!index->amsearcharray) - { - if (skip_nonnative_saop) - { - /* Ignore because not supported by index */ - *skip_nonnative_saop = true; - continue; - } - /* Caller had better intend this only for bitmap scan */ - Assert(scantype == ST_BITMAPSCAN); - } - if (indexcol > 0) - { - if (skip_lower_saop) - { - /* Caller doesn't want to lose index ordering */ - *skip_lower_saop = true; - continue; - } - found_lower_saop_clause = true; - } + /* + * Caller asked us to generate IndexPaths that omit any + * ScalarArrayOpExpr clauses when the underlying index AM + * lacks native support. + * + * We must omit this clause (and tell caller about it). + */ + *skip_nonnative_saop = true; + continue; } /* OK to include this clause */ @@ -956,11 +905,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, /* * 2. Compute pathkeys describing index's ordering, if any, then see how * many of them are actually useful for this query. This is not relevant - * if we are only trying to build bitmap indexscans, nor if we have to - * assume the scan is unordered. + * if we are only trying to build bitmap indexscans. */ pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN && - !found_lower_saop_clause && has_useful_pathkeys(root, rel)); index_is_ordered = (index->sortopfamily != NULL); if (index_is_ordered && pathkeys_possibly_useful) @@ -1212,7 +1159,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, index, &clauseset, useful_predicate, ST_BITMAPSCAN, - NULL, NULL); result = list_concat(result, indexpaths); } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index cea777e9d4..35f8f306ee 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6572,21 +6572,26 @@ genericcostestimate(PlannerInfo *root, selectivityQuals = add_predicate_to_index_quals(index, indexQuals); /* - * Check for ScalarArrayOpExpr index quals, and estimate the number of - * index scans that will be performed. + * If caller didn't give us an estimate for ScalarArrayOpExpr index scans, + * just assume that the number of index descents is the number of distinct + * combinations of array elements from all of the scan's SAOP clauses. 
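+ * + * (For example, under this assumption a scan with quals "a IN (1, 2, 3) + * AND b IN (4, 5)" is costed as performing 3 * 2 = 6 index descents.)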
*/ - num_sa_scans = 1; - foreach(l, indexQuals) + num_sa_scans = costs->num_sa_scans; + if (num_sa_scans < 1) { - RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); - - if (IsA(rinfo->clause, ScalarArrayOpExpr)) + num_sa_scans = 1; + foreach(l, indexQuals) { - ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; - double alength = estimate_array_length(root, lsecond(saop->args)); + RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); - if (alength > 1) - num_sa_scans *= alength; + if (IsA(rinfo->clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; + double alength = estimate_array_length(root, lsecond(saop->args)); + + if (alength > 1) + num_sa_scans *= alength; + } } } @@ -6813,9 +6818,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * For a RowCompareExpr, we consider only the first column, just as * rowcomparesel() does. * - * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N - * index scans not one, but the ScalarArrayOpExpr's operator can be - * considered to act the same as it normally does. + * If there's a ScalarArrayOpExpr in the quals, we'll actually perform up + * to N index descents (not just one), but the ScalarArrayOpExpr's + * operator can be considered to act the same as it normally does. */ indexBoundQuals = NIL; indexcol = 0; @@ -6867,7 +6872,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, clause_op = saop->opno; found_saop = true; - /* count number of SA scans induced by indexBoundQuals only */ + /* estimate SA descents by indexBoundQuals only */ if (alength > 1) num_sa_scans *= alength; } @@ -6930,10 +6935,48 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, NULL); numIndexTuples = btreeSelectivity * index->rel->tuples; + /* + * btree automatically combines individual ScalarArrayOpExpr primitive + * index scans whenever the tuples covered by the next set of array + * keys are close to tuples covered by the current set. That puts a + * natural ceiling on the worst case number of descents -- there + * cannot possibly be more than one descent per leaf page scanned. + * + * Clamp the number of descents to at most 1/3 the number of index + * pages. This avoids implausibly high estimates with low selectivity + * paths, where scans usually require only one or two descents. This + * is most likely to help when there are several SAOP clauses, where + * naively accepting the total number of distinct combinations of + * array elements as the number of descents would frequently lead to + * wild overestimates. + * + * We somewhat arbitrarily don't just make the cutoff the total number + * of leaf pages (we make it 1/3 the total number of pages instead) to + * give the btree code credit for its ability to continue on the leaf + * level with low selectivity scans. + */ + num_sa_scans = Min(num_sa_scans, ceil(index->pages * 0.3333333)); + num_sa_scans = Max(num_sa_scans, 1); + /* * As in genericcostestimate(), we have to adjust for any * ScalarArrayOpExpr quals included in indexBoundQuals, and then round * to integer. + * + * It is tempting to make genericcostestimate behave as if SAOP + * clauses work in almost the same way as scalar operators during + * btree scans, making the top-level scan look like a continuous scan + * (as opposed to num_sa_scans-many primitive index scans). After + * all, btree scans mostly work like that at runtime. 
However, such a + * scheme would badly bias genericcostestimate's simplistic approach + * to calculating numIndexPages through prorating. + * + * Stick with the approach taken by non-native SAOP scans for now. + * genericcostestimate will use the Mackert-Lohman formula to + * compensate for repeat page fetches, even though that definitely + * won't happen during btree scans (not for leaf pages, at least). + * We're usually very pessimistic about the number of primitive index + * scans that will be required, but it's not clear how to do better. */ numIndexTuples = rint(numIndexTuples / num_sa_scans); } @@ -6942,6 +6985,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * Now do generic index cost estimation. */ costs.numIndexTuples = numIndexTuples; + costs.num_sa_scans = num_sa_scans; genericcostestimate(root, path, loop_count, &costs); @@ -6952,9 +6996,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * comparisons to descend a btree of N leaf tuples. We charge one * cpu_operator_cost per comparison. * - * If there are ScalarArrayOpExprs, charge this once per SA scan. The - * ones after the first one are not startup cost so far as the overall - * plan is concerned, so add them only to "total" cost. + * If there are ScalarArrayOpExprs, charge this once per estimated SA + * index descent. The ones after the first one are not startup cost so + * far as the overall plan goes, so just add them to "total" cost. */ if (index->tuples > 1) /* avoid computing log(0) */ { @@ -6971,7 +7015,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * in cases where only a single leaf page is expected to be visited. This * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page * touched. The number of such pages is btree tree height plus one (ie, - * we charge for the leaf page too). As above, charge once per estimated + * SA index descent. + * we charge for the leaf page too). As above, charge once per estimated + * SA index descent.
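+ * (As with the comparison costs above, descents after the first one are + * charged to "total" cost only, not to startup cost.)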
*/ descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost; costs.indexStartupCost += descentCost; diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 2c6c307efc..00300dd720 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -194,7 +194,7 @@ typedef void (*amrestrpos_function) (IndexScanDesc scan); */ /* estimate size of parallel scan descriptor */ -typedef Size (*amestimateparallelscan_function) (void); +typedef Size (*amestimateparallelscan_function) (int nkeys, int norderbys); /* prepare for parallel index scan */ typedef void (*aminitparallelscan_function) (void *target); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 8026c2b36d..fdcfbe8db7 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -165,7 +165,8 @@ extern void index_rescan(IndexScanDesc scan, extern void index_endscan(IndexScanDesc scan); extern void index_markpos(IndexScanDesc scan); extern void index_restrpos(IndexScanDesc scan); -extern Size index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot); +extern Size index_parallelscan_estimate(Relation indexRelation, + int nkeys, int norderbys, Snapshot snapshot); extern void index_parallelscan_initialize(Relation heapRelation, Relation indexRelation, Snapshot snapshot, ParallelIndexScanDesc target); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6eb162052e..b9053219a6 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -960,11 +960,20 @@ typedef struct BTScanPosData * moreLeft and moreRight track whether we think there may be matching * index entries to the left and right of the current page, respectively. * We can clear the appropriate one of these flags when _bt_checkkeys() - * returns continuescan = false. + * sets BTReadPageState.continuescan = false. */ bool moreLeft; bool moreRight; + /* + * Direction of the scan at the time that _bt_readpage was called. + * + * Used by btrestrpos to "restore" the scan's array keys by resetting each + * array to its first element's value (first in this scan direction). This + * avoids the need to directly track the array keys in btmarkpos. + */ + ScanDirection dir; + /* * If we are doing an index-only scan, nextTupleOffset is the first free * location in the associated tuple storage workspace. @@ -1022,9 +1031,8 @@ typedef BTScanPosData *BTScanPos; /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo { - int scan_key; /* index of associated key in arrayKeyData */ + int scan_key; /* index of associated key in keyData */ int cur_elem; /* index of current element in elem_values */ - int mark_elem; /* index of marked element in elem_values */ int num_elems; /* number of elems in current array value */ Datum *elem_values; /* array of num_elems Datums */ } BTArrayKeyInfo; @@ -1037,14 +1045,11 @@ typedef struct BTScanOpaqueData ScanKey keyData; /* array of preprocessed scan keys */ /* workspace for SK_SEARCHARRAY support */ - ScanKey arrayKeyData; /* modified copy of scan->keyData */ - bool arraysStarted; /* Started array keys, but have yet to "reach - * past the end" of all arrays? 
*/ - int numArrayKeys; /* number of equality-type array keys (-1 if - * there are any unsatisfiable array keys) */ - int arrayKeyCount; /* count indicating number of array scan keys - * processed */ + int numArrayKeys; /* number of equality-type array keys */ + bool needPrimScan; /* New prim scan to continue in current dir? */ + bool scanBehind; /* Last array advancement matched -inf attr? */ BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + FmgrInfo *orderProcs; /* ORDER procs for required equality keys */ MemoryContext arrayContext; /* scan-lifespan context for array data */ /* info about killed items if any (killedItems is NULL if never used) */ @@ -1075,6 +1080,42 @@ typedef struct BTScanOpaqueData typedef BTScanOpaqueData *BTScanOpaque; +/* + * _bt_readpage state used across _bt_checkkeys calls for a page + */ +typedef struct BTReadPageState +{ + /* Input parameters, set by _bt_readpage for _bt_checkkeys */ + ScanDirection dir; /* current scan direction */ + OffsetNumber minoff; /* Lowest non-pivot tuple's offset */ + OffsetNumber maxoff; /* Highest non-pivot tuple's offset */ + IndexTuple finaltup; /* Needed by scans with array keys */ + BlockNumber prev_scan_page; /* previous _bt_parallel_release block */ + Page page; /* Page being read */ + + /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */ + OffsetNumber offnum; /* current tuple's page offset number */ + + /* Output parameter, set by _bt_checkkeys for _bt_readpage */ + OffsetNumber skip; /* Array keys "look ahead" skip offnum */ + bool continuescan; /* Terminate ongoing (primitive) index scan? */ + + /* + * Input and output parameters, set and unset by both _bt_readpage and + * _bt_checkkeys to manage precheck optimizations + */ + bool prechecked; /* precheck set continuescan to 'true'? */ + bool firstmatch; /* at least one match so far? */ + + /* + * Private _bt_checkkeys state used to manage "look ahead" optimization + * (only used during scans with array keys) + */ + int16 rechecks; + int16 targetdistance; + +} BTReadPageState; + /* * We use some private sk_flags bits in preprocessed scan keys. We're allowed * to use bits 16-31 (see skey.h). 
 /*
  * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
  * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
@@ -1128,7 +1168,7 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull,
 					 bool indexUnchanged,
 					 struct IndexInfo *indexInfo);
 extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
-extern Size btestimateparallelscan(void);
+extern Size btestimateparallelscan(int nkeys, int norderbys);
 extern void btinitparallelscan(void *target);
 extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
 extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
@@ -1149,10 +1189,12 @@ extern bool btcanreturn(Relation index, int attno);
 /*
  * prototypes for internal functions in nbtree.c
  */
-extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
+extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno,
+							   bool first);
 extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
 extern void _bt_parallel_done(IndexScanDesc scan);
-extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
+extern void _bt_parallel_primscan_schedule(IndexScanDesc scan,
+										   BlockNumber prev_scan_page);
 
 /*
  * prototypes for functions in nbtdedup.c
@@ -1243,15 +1285,11 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
  */
 extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup);
 extern void _bt_freestack(BTStack stack);
-extern void _bt_preprocess_array_keys(IndexScanDesc scan);
+extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir);
 extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
-extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
-extern void _bt_mark_array_keys(IndexScanDesc scan);
-extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
-extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
-						  int tupnatts, ScanDirection dir, bool *continuescan,
-						  bool requiredMatchedByPrecheck, bool haveFirstMatch);
+extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys,
+						  IndexTuple tuple, int tupnatts);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
 extern BTCycleId _bt_start_vacuum(Relation rel);
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 2fa4c4fc1b..f2563ad1cb 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -117,6 +117,9 @@ typedef struct VariableStatData
  * Callers should initialize all fields of GenericCosts to zero.  In addition,
  * they can set numIndexTuples to some positive value if they have a better
  * than default way of estimating the number of leaf index tuples visited.
+ * Similarly, they can set num_sa_scans to some value >= 1 for an index AM
+ * that doesn't necessarily perform exactly one primitive index scan per
+ * distinct combination of ScalarArrayOp array elements.
*/ typedef struct { diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 8311a03c3d..510646cbce 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -189,6 +189,58 @@ select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limi 48 | 8 (1 row) +-- +-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants +-- +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); + QUERY PLAN +------------------------------------------------------------------ + Unique + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[])) +(3 rows) + +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); + hundred +--------- + 47 + 48 + 72 + 82 +(4 rows) + +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; + QUERY PLAN +------------------------------------------------------------------ + Unique + -> Index Only Scan Backward using tenk1_hundred on tenk1 + Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[])) +(3 rows) + +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; + hundred +--------- + 82 + 72 + 48 + 47 +(4 rows) + +explain (costs off) +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; + QUERY PLAN +--------------------------------------------------------------------------------------- + Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = ANY ('{364,366,380}'::integer[])) AND (tenthous = 200000)) +(2 rows) + +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; + thousand +---------- +(0 rows) + -- -- Check correct optimization of LIKE (special index operator support) -- for both indexscan and bitmapscan cases diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 70ab47a92f..cf6eac5734 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1698,6 +1698,12 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; 0 (1 row) +SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1); + count +------- + 1 +(1 row) + DROP INDEX onek_nulltest; CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2 desc nulls last,unique1); SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL; @@ -1910,7 +1916,7 @@ SELECT count(*) FROM dupindexcols (1 row) -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) SELECT unique1 FROM tenk1 @@ -1932,49 +1938,186 @@ ORDER BY unique1; 42 (3 rows) +-- Non-required array scan key on "tenthous": explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: (thousand < 2) - Filter: (tenthous = ANY ('{1001,3000}'::integer[])) + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY 
thousand; + thousand | tenthous +----------+---------- + 0 | 3000 + 1 | 1001 +(2 rows) + +-- Non-required array scan key on "tenthous", backward scan: +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + QUERY PLAN +-------------------------------------------------------------------------------- + Index Only Scan Backward using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + thousand | tenthous +----------+---------- + 1 | 1001 + 0 | 3000 +(2 rows) + +-- +-- Check elimination of redundant and contradictory index quals +-- +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = ANY ('{7,8,9}'::integer[]))) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + unique1 +--------- + 7 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{7,14,22}'::integer[])) AND (unique1 = ANY ('{33,44}'::bigint[]))) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + QUERY PLAN +--------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 1)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + unique1 +--------- + 1 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + QUERY PLAN +------------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 12345)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + QUERY PLAN +----------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 >= 42)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + unique1 +--------- + 42 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + QUERY PLAN +---------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 > 42)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + 
unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + QUERY PLAN +-------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 > 9996) AND (unique1 >= 9999)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + unique1 +--------- + 9999 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + QUERY PLAN +-------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 < 3) AND (unique1 <= 3)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + unique1 +--------- + 0 + 1 + 2 (3 rows) -SELECT thousand, tenthous FROM tenk1 -WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; - thousand | tenthous -----------+---------- - 0 | 3000 - 1 | 1001 +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + QUERY PLAN +------------------------------------------------------------ + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 < 3) AND (unique1 < '-1'::bigint)) (2 rows) -SET enable_indexonlyscan = OFF; +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + unique1 +--------- +(0 rows) + explain (costs off) -SELECT thousand, tenthous FROM tenk1 -WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; QUERY PLAN -------------------------------------------------------------------------------------- - Sort - Sort Key: thousand - -> Index Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(4 rows) - -SELECT thousand, tenthous FROM tenk1 -WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; - thousand | tenthous -----------+---------- - 0 | 3000 - 1 | 1001 + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 < '-1'::bigint)) (2 rows) -RESET enable_indexonlyscan; +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; + unique1 +--------- +(0 rows) + -- -- Check elimination of constant-NULL subexpressions -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 63cddac0d6..8b640c2fc2 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -8880,10 +8880,9 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]); Merge Cond: (j1.id1 = j2.id1) Join Filter: (j2.id2 = j1.id2) -> Index Scan using j1_id1_idx on j1 - -> Index Only Scan using j2_pkey on j2 + -> Index Scan using j2_id1_idx on j2 Index Cond: (id1 >= ANY ('{1,5}'::integer[])) - Filter: ((id1 % 1000) = 1) -(7 rows) +(6 rows) select * from j1 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index 4ffc5b4c56..87273fa635 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -361,6 +361,7 @@ alter table tenk2 reset (parallel_workers); -- test parallel index scans. 
set enable_seqscan to off; set enable_bitmapscan to off; +set random_page_cost = 2; explain (costs off) select count((unique1)) from tenk1 where hundred > 1; QUERY PLAN @@ -379,6 +380,30 @@ select count((unique1)) from tenk1 where hundred > 1; 9800 (1 row) +-- Parallel ScalarArrayOp index scan +explain (costs off) + select count((unique1)) from tenk1 + where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); + QUERY PLAN +--------------------------------------------------------------------- + Finalize Aggregate + InitPlan 1 + -> Aggregate + -> Function Scan on generate_series i + -> Gather + Workers Planned: 4 + -> Partial Aggregate + -> Parallel Index Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = ANY ((InitPlan 1).col1)) +(9 rows) + +select count((unique1)) from tenk1 +where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); + count +------- + 700 +(1 row) + -- test parallel index-only scans. explain (costs off) select count(*) from tenk1 where thousand > 95; diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index ef84354234..0d2a33f370 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -135,6 +135,21 @@ explain (costs off) select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1; select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1; +-- +-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants +-- +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); + +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; + +explain (costs off) +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; + -- -- Check correct optimization of LIKE (special index operator support) -- for both indexscan and bitmapscan cases diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index d49ce9f300..e296891cab 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -668,6 +668,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; +SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1); DROP INDEX onek_nulltest; @@ -753,7 +754,7 @@ SELECT count(*) FROM dupindexcols WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX'; -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) @@ -765,6 +766,7 @@ SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; +-- Non-required array scan key on "tenthous": explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -774,18 +776,68 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; -SET enable_indexonlyscan 
= OFF; - +-- Non-required array scan key on "tenthous", backward scan: explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +ORDER BY thousand DESC, tenthous DESC; SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +ORDER BY thousand DESC, tenthous DESC; -RESET enable_indexonlyscan; +-- +-- Check elimination of redundant and contradictory index quals +-- +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; -- -- Check elimination of constant-NULL subexpressions diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index c43a5b2119..20376c03fa 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -137,11 +137,19 @@ alter table tenk2 reset (parallel_workers); -- test parallel index scans. set enable_seqscan to off; set enable_bitmapscan to off; +set random_page_cost = 2; explain (costs off) select count((unique1)) from tenk1 where hundred > 1; select count((unique1)) from tenk1 where hundred > 1; +-- Parallel ScalarArrayOp index scan +explain (costs off) + select count((unique1)) from tenk1 + where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); +select count((unique1)) from tenk1 +where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); + -- test parallel index-only scans. 
explain (costs off) select count(*) from tenk1 where thousand > 95; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 01845ee71d..f87e8b80ec 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -208,8 +208,10 @@ BTPageStat BTPageState BTParallelScanDesc BTPendingFSM +BTReadPageState BTScanInsert BTScanInsertData +BTScanKeyPreproc BTScanOpaque BTScanOpaqueData BTScanPos
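On the executor side, the index_parallelscan_estimate() signature change means callers supply the scan's key counts at estimate time. A sketch modeled on ExecIndexScanEstimate, assuming the usual IndexScanState fields and shm_toc estimator calls:

/* Notionally in src/backend/executor/nodeIndexscan.c */
void
ExecIndexScanEstimate_sketch(IndexScanState *node, ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;

	/* Key counts now flow through to the AM's size estimator */
	node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc,
													 node->iss_NumScanKeys,
													 node->iss_NumOrderByKeys,
													 estate->es_snapshot);
	shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}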