From 5bf748b86bc6786a3fc57fc7ce296c37da6564b0 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 6 Apr 2024 11:47:10 -0400 Subject: [PATCH] Enhance nbtree ScalarArrayOp execution. Commit 9e8da0f7 taught nbtree to handle ScalarArrayOpExpr quals natively. This works by pushing down the full context (the array keys) to the nbtree index AM, enabling it to execute multiple primitive index scans that the planner treats as one continuous index scan/index path. This earlier enhancement enabled nbtree ScalarArrayOp index-only scans. It also allowed scans with ScalarArrayOp quals to return ordered results (with some notable restrictions, described further down). Take this general approach a lot further: teach nbtree SAOP index scans to decide how to execute ScalarArrayOp scans (when and where to start the next primitive index scan) based on physical index characteristics. This can be far more efficient. All SAOP scans will now reliably avoid duplicative leaf page accesses (just like any other nbtree index scan). SAOP scans whose array keys are naturally clustered together now require far fewer index descents, since we'll reliably avoid starting a new primitive scan just to get to a later offset from the same leaf page. The scan's arrays now advance using binary searches for the array element that best matches the next tuple's attribute value. Required scan key arrays (i.e. arrays from scan keys that can terminate the scan) ratchet forward in lockstep with the index scan. Non-required arrays (i.e. arrays from scan keys that can only exclude non-matching tuples) "advance" without the process ever rolling over to a higher-order array. Naturally, only required SAOP scan keys trigger skipping over leaf pages (non-required arrays cannot safely end or start primitive index scans). Consequently, even index scans of a composite index with a high-order inequality scan key (which we'll mark required) and a low-order SAOP scan key (which we won't mark required) now avoid repeating leaf page accesses -- that benefit isn't limited to simpler equality-only cases. In general, all nbtree index scans now output tuples as if they were one continuous index scan -- even scans that mix a high-order inequality with lower-order SAOP equalities reliably output tuples in index order. This allows us to remove a couple of special cases that were applied when building index paths with SAOP clauses during planning. Bugfix commit 807a40c5 taught the planner to avoid generating unsafe path keys: path keys on a multicolumn index path, with a SAOP clause on any attribute beyond the first/most significant attribute. These cases are now all safe, so we go back to generating path keys without regard for the presence of SAOP clauses (just like with any other clause type). Affected queries can now exploit scan output order in all the usual ways (e.g., certain "ORDER BY ... LIMIT n" queries can now terminate early). Also undo changes from follow-up bugfix commit a4523c5a, which taught the planner to produce alternative index paths, with path keys, but without low-order SAOP index quals (filter quals were used instead). We'll no longer generate these alternative paths, since they can no longer offer any meaningful advantages over standard index qual paths. Affected queries thereby avoid all of the disadvantages that come from using filter quals within index scan nodes. They can avoid extra heap page accesses from using filter quals to exclude non-matching tuples (index quals will never have that problem). 
They can also skip over irrelevant sections of the index in more cases (though only when nbtree determines that starting another primitive scan actually makes sense). There is a theoretical risk that removing restrictions on SAOP index paths from the planner will break compatibility with amcanorder-based index AMs maintained as extensions. Such an index AM could have the same limitations around ordered SAOP scans as nbtree had up until now. Adding a pro forma incompatibility item about the issue to the Postgres 17 release notes seems like a good idea. Author: Peter Geoghegan Author: Matthias van de Meent Reviewed-By: Heikki Linnakangas Reviewed-By: Matthias van de Meent Reviewed-By: Tomas Vondra Discussion: https://postgr.es/m/CAH2-Wz=ksvN_sjcnD1+Bt-WtifRA5ok48aDYnq3pkKhxgMQpcw@mail.gmail.com --- doc/src/sgml/indexam.sgml | 10 +- doc/src/sgml/monitoring.sgml | 13 + src/backend/access/index/indexam.c | 10 +- src/backend/access/nbtree/nbtree.c | 202 +- src/backend/access/nbtree/nbtsearch.c | 249 +- src/backend/access/nbtree/nbtutils.c | 2910 +++++++++++++++-- src/backend/executor/nodeIndexonlyscan.c | 2 + src/backend/executor/nodeIndexscan.c | 2 + src/backend/optimizer/path/indxpath.c | 90 +- src/backend/utils/adt/selfuncs.c | 83 +- src/include/access/amapi.h | 2 +- src/include/access/genam.h | 3 +- src/include/access/nbtree.h | 79 +- src/include/utils/selfuncs.h | 3 + src/test/regress/expected/btree_index.out | 52 + src/test/regress/expected/create_index.out | 203 +- src/test/regress/expected/join.out | 5 +- src/test/regress/expected/select_parallel.out | 25 + src/test/regress/sql/btree_index.sql | 15 + src/test/regress/sql/create_index.sql | 64 +- src/test/regress/sql/select_parallel.sql | 8 + src/tools/pgindent/typedefs.list | 2 + 22 files changed, 3470 insertions(+), 562 deletions(-) diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index b68daa55ae..76ac0fcddd 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -809,7 +809,8 @@ amrestrpos (IndexScanDesc scan); Size -amestimateparallelscan (void); +amestimateparallelscan (int nkeys, + int norderbys); Estimate and return the number of bytes of dynamic shared memory which the access method will be needed to perform a parallel scan. (This number @@ -817,6 +818,13 @@ amestimateparallelscan (void); AM-independent data in ParallelIndexScanDescData.) + + The nkeys and norderbys + parameters indicate the number of quals and ordering operators that will be + used in the scan; the same values will be passed to amrescan. + Note that the actual values of the scan keys aren't provided yet. + + It is not necessary to implement this function for access methods which do not support parallel scans or for which the number of additional bytes diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index e1e96ba7c4..053da8d6e4 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4064,6 +4064,19 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + + Queries that use certain SQL constructs to search for + rows matching any value out of a list or array of multiple scalar values + (see ) perform multiple + primitive index scans (up to one primitive scan per scalar + value) during query execution. Each internal primitive index scan + increments pg_stat_all_indexes.idx_scan, + so it's possible for the count of index scans to significantly exceed the + total number of index scan executor node executions. 
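To make the counting rule concrete: every primitive index scan passes through _bt_first, which calls pgstat_count_index_scan, so idx_scan advances once per primitive scan rather than once per executor node execution. A standalone toy model of just that counting rule (nothing below is part of the patch; all names are illustrative):

    #include <stdio.h>

    static long idx_scan = 0;   /* stands in for pg_stat_all_indexes.idx_scan */

    /* stands in for one _bt_first call, which counts one primitive scan */
    static void
    primitive_index_scan(int key)
    {
        idx_scan++;
        printf("primitive scan for array key %d\n", key);
    }

    int
    main(void)
    {
        int     arraykeys[] = {1, 2, 3};    /* e.g. WHERE a IN (1, 2, 3) */

        /* one "index scan executor node execution" */
        for (int i = 0; i < 3; i++)
            primitive_index_scan(arraykeys[i]);

        /* idx_scan can exceed the single node execution, as described above */
        printf("node executions: 1, idx_scan: %ld\n", idx_scan);
        return 0;
    }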
+ + + diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 78ac3b1abb..7510159fc8 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -449,13 +449,10 @@ index_restrpos(IndexScanDesc scan) /* * index_parallelscan_estimate - estimate shared memory for parallel scan - * - * Currently, we don't pass any information to the AM-specific estimator, - * so it can probably only return a constant. In the future, we might need - * to pass more information. */ Size -index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot) +index_parallelscan_estimate(Relation indexRelation, int nkeys, int norderbys, + Snapshot snapshot) { Size nbytes; @@ -474,7 +471,8 @@ index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot) */ if (indexRelation->rd_indam->amestimateparallelscan != NULL) nbytes = add_size(nbytes, - indexRelation->rd_indam->amestimateparallelscan()); + indexRelation->rd_indam->amestimateparallelscan(nkeys, + norderbys)); return nbytes; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 41df1027d2..686a3206f7 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -40,6 +40,9 @@ /* * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. * + * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the + * scan to advance it via another call to _bt_first. + * * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to * a new page; others must wait. * @@ -47,11 +50,11 @@ * to a new page; some process can start doing that. * * BTPARALLEL_DONE indicates that the scan is complete (including error exit). - * We reach this state once for every distinct combination of array keys. */ typedef enum { BTPARALLEL_NOT_INITIALIZED, + BTPARALLEL_NEED_PRIMSCAN, BTPARALLEL_ADVANCING, BTPARALLEL_IDLE, BTPARALLEL_DONE, @@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. */ - int btps_arrayKeyCount; /* count indicating number of array scan - * keys processed by parallel scan */ - slock_t btps_mutex; /* protects above variables */ + slock_t btps_mutex; /* protects above variables, btps_arrElems */ ConditionVariable btps_cv; /* used to synchronize parallel scan */ + + /* + * btps_arrElems is used when scans need to schedule another primitive + * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys. + */ + int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]; } BTParallelScanDescData; typedef struct BTParallelScanDescData *BTParallelScanDesc; @@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* btree indexes are never lossy */ scan->xs_recheck = false; - /* - * If we have any array keys, initialize them during first call for a - * scan. We can't do this in btrescan because we don't know the scan - * direction at that time. - */ - if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) - { - /* punt if we have any unsatisfiable array keys */ - if (so->numArrayKeys < 0) - return false; - - _bt_start_array_keys(scan, dir); - } - - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* @@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* If we have a tuple, return it ... */ if (res) break; - /* ... 
otherwise see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + /* ... otherwise see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir)); return res; } @@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) int64 ntids = 0; ItemPointer heapTid; - /* - * If we have any array keys, initialize them. - */ - if (so->numArrayKeys) - { - /* punt if we have any unsatisfiable array keys */ - if (so->numArrayKeys < 0) - return ntids; - - _bt_start_array_keys(scan, ForwardScanDirection); - } - - /* This loop handles advancing to the next array elements, if any */ + /* Each loop iteration performs another primitive index scan */ do { /* Fetch the first page & tuple */ @@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) ntids++; } } - /* Now see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); + /* Now see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection)); return ntids; } @@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) else so->keyData = NULL; - so->arrayKeyData = NULL; /* assume no array keys for now */ - so->arraysStarted = false; - so->numArrayKeys = 0; + so->needPrimScan = false; + so->scanBehind = false; so->arrayKeys = NULL; + so->orderProcs = NULL; so->arrayContext = NULL; so->killedItems = NULL; /* until needed */ @@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, } so->markItemIndex = -1; - so->arrayKeyCount = 0; + so->needPrimScan = false; + so->scanBehind = false; BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); @@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */ - - /* If any keys are SK_SEARCHARRAY type, set up array-key info */ - _bt_preprocess_array_keys(scan); + so->numArrayKeys = 0; /* ditto */ } /* @@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan) /* Release storage */ if (so->keyData != NULL) pfree(so->keyData); - /* so->arrayKeyData and so->arrayKeys are in arrayContext */ + /* so->arrayKeys and so->orderProcs are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); if (so->killedItems != NULL) @@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan) BTScanPosInvalidate(so->markPos); so->markItemIndex = -1; } - - /* Also record the current positions of any array keys */ - if (so->numArrayKeys) - _bt_mark_array_keys(scan); } /* @@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* Restore the marked positions of any array keys */ - if (so->numArrayKeys) - _bt_restore_array_keys(scan); - if (so->markItemIndex >= 0) { /* @@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan) if (so->currTuples) memcpy(so->currTuples, so->markTuples, so->markPos.nextTupleOffset); + /* Reset the scan's array keys (see _bt_steppage for why) */ + if (so->numArrayKeys) + { + _bt_start_array_keys(scan, so->currPos.dir); + so->needPrimScan = false; + } } else BTScanPosInvalidate(so->currPos); @@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan) * btestimateparallelscan -- estimate storage for BTParallelScanDescData */ Size -btestimateparallelscan(void) +btestimateparallelscan(int nkeys, int norderbys) { - return 
sizeof(BTParallelScanDescData); + /* Pessimistically assume all input scankeys will be output with arrays */ + return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys; } /* @@ -572,7 +551,6 @@ btinitparallelscan(void *target) SpinLockInit(&bt_target->btps_mutex); bt_target->btps_scanPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - bt_target->btps_arrayKeyCount = 0; ConditionVariableInit(&bt_target->btps_cv); } @@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan) SpinLockAcquire(&btscan->btps_mutex); btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount = 0; SpinLockRelease(&btscan->btps_mutex); } @@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan) * or _bt_parallel_done(). * * The return value is true if we successfully seized the scan and false - * if we did not. The latter case occurs if no pages remain for the current - * set of scankeys. + * if we did not. The latter case occurs if no pages remain. * * If the return value is true, *pageno returns the next or current page * of the scan (depending on the scan direction). An invalid block number - * means the scan hasn't yet started, and P_NONE means we've reached the end. + * means the scan hasn't yet started, or that caller needs to start the next + * primitive index scan (if it's the latter case we'll set so.needPrimScan). * The first time a participating process reaches the last page, it will return * true and set *pageno to P_NONE; after that, further attempts to seize the * scan will return false. * * Callers should ignore the value of pageno if the return value is false. + * + * Callers that are in a position to start a new primitive index scan must + * pass first=true (all other callers pass first=false). We just return false + * for first=false callers that require another primitive index scan. */ bool -_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - BTPS_State pageStatus; bool exit_loop = false; bool status = true; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) *pageno = P_NONE; + if (first) + { + /* + * Initialize array related state when called from _bt_first, assuming + * that this will either be the first primitive index scan for the + * scan, or a previous explicitly scheduled primitive scan. + * + * Note: so->needPrimScan is only set when a scheduled primitive index + * scan is set to be performed in caller's worker process. It should + * not be set here by us for the first primitive scan, nor should we + * ever set it for a parallel scan that has no array keys. + */ + so->needPrimScan = false; + so->scanBehind = false; + } + else + { + /* + * Don't attempt to seize the scan when backend requires another + * primitive index scan unless we're in a position to start it now + */ + if (so->needPrimScan) + return false; + } + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); while (1) { SpinLockAcquire(&btscan->btps_mutex); - pageStatus = btscan->btps_pageStatus; - if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + if (btscan->btps_pageStatus == BTPARALLEL_DONE) { - /* Parallel scan has already advanced to a new set of scankeys. 
*/ + /* We're done with this parallel index scan */ status = false; } - else if (pageStatus == BTPARALLEL_DONE) + else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) { + Assert(so->numArrayKeys); + /* - * We're done with this set of scankeys. This may be the end, or - * there could be more sets to try. + * If we can start another primitive scan right away, do so. + * Otherwise just wait. */ - status = false; + if (first) + { + btscan->btps_pageStatus = BTPARALLEL_ADVANCING; + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + ScanKey skey = &so->keyData[array->scan_key]; + + array->cur_elem = btscan->btps_arrElems[i]; + skey->sk_argument = array->elem_values[array->cur_elem]; + } + so->needPrimScan = true; + so->scanBehind = false; + *pageno = InvalidBlockNumber; + exit_loop = true; + } } - else if (pageStatus != BTPARALLEL_ADVANCING) + else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING) { /* * We have successfully seized control of the scan for the purpose @@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) * _bt_parallel_release() -- Complete the process of advancing the scan to a * new page. We now have the new value btps_scanPage; some other backend * can now begin advancing the scan. + * + * Callers whose scan uses array keys must save their scan_page argument so + * that it can be passed to _bt_parallel_primscan_schedule, should caller + * determine that another primitive index scan is required. If that happens, + * scan_page won't be scanned by any backend (unless the next primitive index + * scan lands on scan_page). */ void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) @@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) void _bt_parallel_done(IndexScanDesc scan) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; bool status_changed = false; @@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan) parallel_scan->ps_offset); /* - * Mark the parallel scan as done for this combination of scan keys, - * unless some other process already did so. See also - * _bt_advance_array_keys. + * Mark the parallel scan as done, unless some other process did so + * already */ SpinLockAcquire(&btscan->btps_mutex); - if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && - btscan->btps_pageStatus != BTPARALLEL_DONE) + if (btscan->btps_pageStatus != BTPARALLEL_DONE) { btscan->btps_pageStatus = BTPARALLEL_DONE; status_changed = true; @@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan) } /* - * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array - * keys. + * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan. * - * Updates the count of array keys processed for both local and parallel - * scans. + * Caller passes the block number most recently passed to _bt_parallel_release + * by its backend. Caller successfully schedules the next primitive index scan + * if the shared parallel state hasn't been seized since caller's backend last + * advanced the scan. 
*/ void -_bt_parallel_advance_array_keys(IndexScanDesc scan) +_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; + Assert(so->numArrayKeys); + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); - so->arrayKeyCount++; SpinLockAcquire(&btscan->btps_mutex); - if (btscan->btps_pageStatus == BTPARALLEL_DONE) + if (btscan->btps_scanPage == prev_scan_page && + btscan->btps_pageStatus == BTPARALLEL_IDLE) { btscan->btps_scanPage = InvalidBlockNumber; - btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount++; + btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; + + /* Serialize scan's current array keys */ + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *array = &so->arrayKeys[i]; + + btscan->btps_arrElems[i] = array->cur_elem; + } } SpinLockRelease(&btscan->btps_mutex); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index e3fff90d8e..d241e8ea1d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -907,7 +907,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (!so->qual_ok) { - /* Notify any other workers that we're done with this scan key. */ _bt_parallel_done(scan); return false; } @@ -917,10 +916,22 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * scan has not started, proceed to find out first leaf page in the usual * way while keeping other participating processes waiting. If the scan * has already begun, use the page number from the shared structure. + * + * When a parallel scan has another primitive index scan scheduled, a + * parallel worker will seize the scan for that purpose now. This is + * similar to the case where the top-level scan hasn't started. */ if (scan->parallel_scan != NULL) { - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, true); + + /* + * Initialize arrays (when _bt_parallel_seize didn't already set up + * the next primitive index scan) + */ + if (so->numArrayKeys && !so->needPrimScan) + _bt_start_array_keys(scan, dir); + if (!status) return false; else if (blkno == P_NONE) @@ -935,6 +946,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) goto readcomplete; } } + else if (so->numArrayKeys && !so->needPrimScan) + { + /* + * First _bt_first call (for current btrescan) without parallelism. + * + * Initialize arrays, and the corresponding scan keys that were just + * output by _bt_preprocess_keys. + */ + _bt_start_array_keys(scan, dir); + } /*---------- * Examine the scan keys to discover where we need to start the scan. @@ -980,6 +1001,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * * The selected scan keys (at most one per index column) are remembered by * storing their addresses into the local startKeys[] array. + * + * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start + * the next primitive index scan (for scans with array keys) based in part + * on an understanding of how it'll enable us to reposition the scan. + * They're directly aware of how we'll sometimes cons up an explicit + * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a + * symmetric "deduce NOT NULL" rule of their own. 
This allows top-level + * scans to skip large groups of NULLs through repeated deductions about + * key strictness (for a required inequality key) and whether NULLs in the + * key's index column are stored last or first (relative to non-NULLs). + * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might + * need to be kept in sync. *---------- */ strat_total = BTEqualStrategyNumber; @@ -1502,7 +1535,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * We scan the current page starting at offnum and moving in the indicated * direction. All items matching the scan keys are loaded into currPos.items. * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports - * that there can be no more matching tuples in the current scan direction. + * that there can be no more matching tuples in the current scan direction + * (could just be for the current primitive index scan when scan has arrays). * * _bt_first caller passes us an offnum returned by _bt_binsrch, which might * be an out of bounds offnum such as "maxoff + 1" in certain corner cases. @@ -1527,11 +1561,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; - int itemIndex; - bool continuescan; - int indnatts; - bool continuescanPrechecked; - bool haveFirstMatch = false; + BTReadPageState pstate; + bool arrayKeys; + int itemIndex, + indnatts; /* * We must have the buffer pinned and locked, but the usual macro can't be @@ -1546,16 +1579,32 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (scan->parallel_scan) { if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, opaque->btpo_next); + pstate.prev_scan_page = opaque->btpo_next; else - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf); + + _bt_parallel_release(scan, pstate.prev_scan_page); } - continuescan = true; /* default assumption */ indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + arrayKeys = so->numArrayKeys != 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); + /* initialize page-level state that we'll pass to _bt_checkkeys */ + pstate.dir = dir; + pstate.minoff = minoff; + pstate.maxoff = maxoff; + pstate.finaltup = NULL; + pstate.page = page; + pstate.offnum = InvalidOffsetNumber; + pstate.skip = InvalidOffsetNumber; + pstate.continuescan = true; /* default assumption */ + pstate.prechecked = false; + pstate.firstmatch = false; + pstate.rechecks = 0; + pstate.targetdistance = 0; + /* * We note the buffer's block number so that we can release the pin later. * This allows us to re-read the buffer if it is needed again for hinting. @@ -1598,10 +1647,34 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * corresponding value from the last item on the page. So checking with * the last item on the page would give a more precise answer. * - * We skip this for the first page in the scan to evade the possible - * slowdown of the point queries. + * We skip this for the first page read by each (primitive) scan, to avoid + * slowing down point queries. They typically don't stand to gain much + * when the optimization can be applied, and are more likely to notice the + * overhead of the precheck. 
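The precheck's underlying reasoning: tuples on a leaf page are in index order, so if the page's final tuple in the scan direction satisfies every required key, all earlier tuples on the page must satisfy them too, and the per-tuple required-key tests can be skipped. A self-contained sketch of that inference with a single "attr <= bound" required key (illustrative names only, not the patch's code):

    #include <stdbool.h>
    #include <stdio.h>

    /* one required scan key: attr <= bound */
    static bool
    satisfies_required(int attr, int bound)
    {
        return attr <= bound;
    }

    int
    main(void)
    {
        int     page[] = {10, 20, 30, 40};  /* leaf page, ascending order */
        int     bound = 100;
        bool    prechecked;

        /* forward scan: precheck the final (largest) item only */
        prechecked = satisfies_required(page[3], bound);

        for (int i = 0; i < 4; i++)
        {
            /* when prechecked, skip re-testing the required key */
            bool    passes = prechecked || satisfies_required(page[i], bound);

            printf("item %d passes: %d\n", page[i], passes);
        }
        return 0;
    }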
+ * + * The optimization is unsafe and must be avoided whenever _bt_checkkeys + * just set a low-order required array's key to the best available match + * for a truncated -inf attribute value from the prior page's high key + * (array element 0 is always the best available match in this scenario). + * It's quite likely that matches for array element 0 begin on this page, + * but the start of matches won't necessarily align with page boundaries. + * When the start of matches is somewhere in the middle of this page, it + * would be wrong to treat page's final non-pivot tuple as representative. + * Doing so might lead us to treat some of the page's earlier tuples as + * being part of a group of tuples thought to satisfy the required keys. + * + * Note: Conversely, in the case where the scan's arrays just advanced + * using the prior page's HIKEY _without_ advancement setting scanBehind, + * the start of matches must be aligned with page boundaries, which makes + * it safe to attempt the optimization here now. It's also safe when the + * prior page's HIKEY simply didn't need to advance any required array. In + * both cases we can safely assume that the _first_ tuple from this page + * must be >= the current set of array keys/equality constraints. And so + * if the final tuple is == those same keys (and also satisfies any + * required < or <= strategy scan keys) during the precheck, we can safely + * assume that this must also be true of all earlier tuples from the page. */ - if (!firstPage && minoff < maxoff) + if (!firstPage && !so->scanBehind && minoff < maxoff) { ItemId iid; IndexTuple itup; @@ -1609,22 +1682,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff); itup = (IndexTuple) PageGetItem(page, iid); - /* - * Do the precheck. Note that we pass the pointer to the - * 'continuescanPrechecked' to the 'continuescan' argument. That will - * set flag to true if all required keys are satisfied and false - * otherwise. - */ - (void) _bt_checkkeys(scan, itup, indnatts, dir, - &continuescanPrechecked, false, false); - } - else - { - continuescanPrechecked = false; + /* Call with arrayKeys=false to avoid undesirable side-effects */ + _bt_checkkeys(scan, &pstate, false, itup, indnatts); + pstate.prechecked = pstate.continuescan; + pstate.continuescan = true; /* reset */ } if (ScanDirectionIsForward(dir)) { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (arrayKeys && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in ascending order */ itemIndex = 0; @@ -1649,23 +1722,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, - continuescanPrechecked, - haveFirstMatch); + pstate.offnum = offnum; + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); /* - * If the result of prechecking required keys was true, then in - * assert-enabled builds we also recheck that the _bt_checkkeys() - * result is the same. 
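The skip-ahead logic that follows depends on the binary searches the commit message describes: given a tuple attribute value, find the lowest array element that is >= that value (for a forward scan), and note whether the match is exact. A self-contained analogue over a sorted int array (illustrative names; the patch's _bt_binsrch_array_skey works through ORDER procs and Datums instead):

    #include <stdio.h>

    /*
     * Return the lowest offset in sorted elems[] whose value is >= tupval,
     * which can be nelems when no such element exists.  Sets *exact when
     * the element there equals tupval.
     */
    static int
    binsrch_array_skey(const int *elems, int nelems, int tupval, int *exact)
    {
        int     low = 0,
                high = nelems;

        while (low < high)
        {
            int     mid = low + (high - low) / 2;

            if (elems[mid] < tupval)
                low = mid + 1;
            else
                high = mid;
        }
        *exact = (low < nelems && elems[low] == tupval);
        return low;
    }

    int
    main(void)
    {
        int     elems[] = {2, 4, 8};    /* sorted, deduplicated array keys */
        int     exact;
        int     pos = binsrch_array_skey(elems, 3, 5, &exact);

        /* tuple value 5 advances the array to element 8, with no exact match */
        printf("pos %d, exact %d\n", pos, exact);
        return 0;
    }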
+ * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) */ - Assert((!continuescanPrechecked && haveFirstMatch) || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false, false)); + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum < pstate.skip); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + if (passes_quals) { /* tuple passes all scan key conditions */ - haveFirstMatch = true; + pstate.firstmatch = true; if (!BTreeTupleIsPosting(itup)) { /* Remember it */ @@ -1696,7 +1774,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } } /* When !continuescan, there can't be any more matches, so stop */ - if (!continuescan) + if (!pstate.continuescan) break; offnum = OffsetNumberNext(offnum); @@ -1713,17 +1791,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * only appear on non-pivot tuples on the right sibling page are * common. */ - if (continuescan && !P_RIGHTMOST(opaque)) + if (pstate.continuescan && !P_RIGHTMOST(opaque)) { ItemId iid = PageGetItemId(page, P_HIKEY); IndexTuple itup = (IndexTuple) PageGetItem(page, iid); int truncatt; truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); - _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false); + pstate.prechecked = false; /* precheck didn't cover HIKEY */ + _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt); } - if (!continuescan) + if (!pstate.continuescan) so->currPos.moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); @@ -1733,6 +1812,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } else { + /* SK_SEARCHARRAY backward scans must provide final tuple up front */ + if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, minoff); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in descending order */ itemIndex = MaxTIDsPerBTreePage; @@ -1772,23 +1859,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, - continuescanPrechecked, - haveFirstMatch); + pstate.offnum = offnum; + passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, + itup, indnatts); /* - * If the result of prechecking required keys was true, then in - * assert-enabled builds we also recheck that the _bt_checkkeys() - * result is the same. 
+ * Check if we need to skip ahead to a later tuple (only possible + * when the scan uses array keys) */ - Assert((!continuescanPrechecked && !haveFirstMatch) || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false, false)); + if (arrayKeys && OffsetNumberIsValid(pstate.skip)) + { + Assert(!passes_quals && pstate.continuescan); + Assert(offnum > pstate.skip); + + offnum = pstate.skip; + pstate.skip = InvalidOffsetNumber; + continue; + } + if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions */ - haveFirstMatch = true; + pstate.firstmatch = true; if (!BTreeTupleIsPosting(itup)) { /* Remember it */ @@ -1824,7 +1916,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, } } } - if (!continuescan) + if (!pstate.continuescan) { /* there can't be any more matches, so stop */ so->currPos.moreLeft = false; @@ -1970,6 +2062,31 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) so->currPos.nextTupleOffset); so->markPos.itemIndex = so->markItemIndex; so->markItemIndex = -1; + + /* + * If we're just about to start the next primitive index scan + * (possible with a scan that has array keys, and needs to skip to + * continue in the current scan direction), moreLeft/moreRight only + * indicate the end of the current primitive index scan. They must + * never be taken to indicate that the top-level index scan has ended + * (that would be wrong). + * + * We could handle this case by treating the current array keys as + * markPos state. But depending on the current array state like this + * would add complexity. Instead, we just unset markPos's copy of + * moreRight or moreLeft (whichever might be affected), while making + * btrestrpos reset the scan's arrays to their initial scan positions. + * In effect, btrestrpos leaves advancing the arrays up to the first + * _bt_readpage call (that takes place after it has restored markPos). + */ + Assert(so->markPos.dir == dir); + if (so->needPrimScan) + { + if (ScanDirectionIsForward(dir)) + so->markPos.moreRight = true; + else + so->markPos.moreLeft = true; + } } if (ScanDirectionIsForward(dir)) @@ -1981,7 +2098,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * Seize the scan to get the next block number; if the scan has * ended already, bail out. 
*/ - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, false); BTScanPosUnpinIfPinned(so->currPos); if (!status) { @@ -2097,7 +2214,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) if (scan->parallel_scan != NULL) { _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, false); if (!status) { BTScanPosInvalidate(so->currPos); @@ -2193,7 +2310,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) if (scan->parallel_scan != NULL) { _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno); + status = _bt_parallel_seize(scan, &blkno, false); if (!status) { BTScanPosInvalidate(so->currPos); @@ -2218,6 +2335,8 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + Assert(!so->needPrimScan); + _bt_initialize_more_data(so, dir); if (!_bt_readnextpage(scan, blkno, dir)) @@ -2524,14 +2643,22 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) } /* - * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately - * for scan direction + * _bt_initialize_more_data() -- initialize moreLeft, moreRight and scan dir + * from currPos */ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) { - /* initialize moreLeft/moreRight appropriately for scan direction */ - if (ScanDirectionIsForward(dir)) + so->currPos.dir = dir; + if (so->needPrimScan) + { + Assert(so->numArrayKeys); + + so->currPos.moreLeft = true; + so->currPos.moreRight = true; + so->needPrimScan = false; + } + else if (ScanDirectionIsForward(dir)) { so->currPos.moreLeft = false; so->currPos.moreRight = true; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index d50317096d..e963de78a7 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -29,29 +29,77 @@ #include "utils/memutils.h" #include "utils/rel.h" +#define LOOK_AHEAD_REQUIRED_RECHECKS 3 +#define LOOK_AHEAD_DEFAULT_DISTANCE 5 typedef struct BTSortArrayContext { - FmgrInfo flinfo; + FmgrInfo *sortproc; Oid collation; bool reverse; } BTSortArrayContext; +typedef struct BTScanKeyPreproc +{ + ScanKey skey; + int ikey; + int arrayidx; +} BTScanKeyPreproc; + +static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype, + FmgrInfo *orderproc, FmgrInfo **sortprocp); static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, - StrategyNumber strat, + Oid elemtype, StrategyNumber strat, Datum *elems, int nelems); -static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, - bool reverse, - Datum *elems, int nelems); +static int _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, + bool reverse, Datum *elems, int nelems); +static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey, + FmgrInfo *sortproc, bool reverse, + Oid origelemtype, Oid nextelemtype, + Datum *elems_orig, int *nelems_orig, + Datum *elems_next, int nelems_next); +static bool _bt_compare_array_scankey_args(IndexScanDesc scan, + ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, + bool *qual_ok); +static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan); +static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap); static int _bt_compare_array_elements(const void *a, const void *b, void *arg); +static inline int32 _bt_compare_array_skey(FmgrInfo 
*orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur); +static int _bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result); +static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); +static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); +static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, TupleDesc tupdesc, int tupnatts, + bool readpagetup, int sktrig, bool *scanBehind); +static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int sktrig, bool sktrig_required); +#ifdef USE_ASSERT_CHECKING +static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir); +static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); +#endif static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, + BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result); static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); static void _bt_mark_scankey_required(ScanKey skey); +static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool advancenonrequired, bool prechecked, bool firstmatch, + bool *continuescan, int *ikey); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan); +static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, + int tupnatts, TupleDesc tupdesc); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); @@ -188,29 +236,55 @@ _bt_freestack(BTStack stack) * * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and * set up BTArrayKeyInfo info for each one that is an equality-type key. - * Prepare modified scan keys in so->arrayKeyData, which will hold the current - * array elements during each primitive indexscan operation. For inequality - * array keys, it's sufficient to find the extreme element value and replace - * the whole array with that scalar value. + * Returns modified scan keys as input for further, standard preprocessing. * - * Note: the reason we need so->arrayKeyData, rather than just scribbling - * on scan->keyData, is that callers are permitted to call btrescan without - * supplying a new set of scankey data. + * Currently we perform two kinds of preprocessing to deal with redundancies. + * For inequality array keys, it's sufficient to find the extreme element + * value and replace the whole array with that scalar value. This eliminates + * all but one array element as redundant. Similarly, we are capable of + * "merging together" multiple equality array keys (from two or more input + * scan keys) into a single output scan key containing only the intersecting + * array elements. This can eliminate many redundant array elements, as well + * as eliminating whole array scan keys as redundant. It can also allow us to + * detect contradictory quals. + * + * It is convenient for _bt_preprocess_keys caller to have to deal with no + * more than one equality strategy array scan key per index attribute. We'll + * always be able to set things up that way when complete opfamilies are used. 
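The inequality-array rule above can be shown in miniature: "attr < ANY(array)" is equivalent to "attr < max(array)", so the whole array collapses to one scalar. A standalone sketch (illustrative names; the patch's _bt_find_extreme_element compares Datums via an opfamily comparison operator):

    #include <stdio.h>

    /* pick the extreme (largest) element, as needed for a < or <= qual */
    static int
    find_extreme_element(const int *elems, int nelems)
    {
        int     extreme = elems[0];

        for (int i = 1; i < nelems; i++)
        {
            if (elems[i] > extreme)
                extreme = elems[i];
        }
        return extreme;
    }

    int
    main(void)
    {
        int     elems[] = {1, 9, 5};

        /* WHERE attr < ANY('{1,9,5}') behaves like WHERE attr < 9 */
        printf("attr < %d\n", find_extreme_element(elems, 3));
        return 0;
    }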
+ * Eliminated array scan keys can be recognized as those that have had their + * sk_strategy field set to InvalidStrategy here by us. Caller should avoid + * including these in the scan's so->keyData[] output array. + * + * We set the scan key references from the scan's BTArrayKeyInfo info array to + * offsets into the temp modified input array returned to caller. Scans that + * have array keys should call _bt_preprocess_array_keys_final when standard + * preprocessing steps are complete. This will convert the scan key offset + * references into references to the scan's so->keyData[] output scan keys. + * + * Note: the reason we need to return a temp scan key array, rather than just + * scribbling on scan->keyData, is that callers are permitted to call btrescan + * without supplying a new set of scankey data. */ -void +static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; int numberOfKeys = scan->numberOfKeys; - int16 *indoption = scan->indexRelation->rd_indoption; + int16 *indoption = rel->rd_indoption; int numArrayKeys; + int origarrayatt = InvalidAttrNumber, + origarraykey = -1; + Oid origelemtype = InvalidOid; ScanKey cur; - int i; MemoryContext oldContext; + ScanKey arrayKeyData; /* modified copy of scan->keyData */ + + Assert(numberOfKeys); /* Quick check to see if there are any array keys */ numArrayKeys = 0; - for (i = 0; i < numberOfKeys; i++) + for (int i = 0; i < numberOfKeys; i++) { cur = &scan->keyData[i]; if (cur->sk_flags & SK_SEARCHARRAY) @@ -220,20 +294,15 @@ _bt_preprocess_array_keys(IndexScanDesc scan) /* If any arrays are null as a whole, we can quit right now. */ if (cur->sk_flags & SK_ISNULL) { - so->numArrayKeys = -1; - so->arrayKeyData = NULL; - return; + so->qual_ok = false; + return NULL; } } } /* Quit if nothing to do. 
*/ if (numArrayKeys == 0) - { - so->numArrayKeys = 0; - so->arrayKeyData = NULL; - return; - } + return NULL; /* * Make a scan-lifespan context to hold array-associated data, or reset it @@ -249,18 +318,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan) oldContext = MemoryContextSwitchTo(so->arrayContext); /* Create modifiable copy of scan->keyData in the workspace context */ - so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); - memcpy(so->arrayKeyData, - scan->keyData, - scan->numberOfKeys * sizeof(ScanKeyData)); + arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData)); + memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData)); /* Allocate space for per-array data in the workspace context */ - so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo)); + so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo)); + + /* Allocate space for ORDER procs used to help _bt_checkkeys */ + so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo)); /* Now process each array key */ numArrayKeys = 0; - for (i = 0; i < numberOfKeys; i++) + for (int i = 0; i < numberOfKeys; i++) { + FmgrInfo sortproc; + FmgrInfo *sortprocp = &sortproc; + Oid elemtype; + bool reverse; ArrayType *arrayval; int16 elmlen; bool elmbyval; @@ -271,7 +345,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) int num_nonnulls; int j; - cur = &so->arrayKeyData[i]; + cur = &arrayKeyData[i]; if (!(cur->sk_flags & SK_SEARCHARRAY)) continue; @@ -305,10 +379,21 @@ _bt_preprocess_array_keys(IndexScanDesc scan) /* If there's no non-nulls, the scan qual is unsatisfiable */ if (num_nonnulls == 0) { - numArrayKeys = -1; + so->qual_ok = false; break; } + /* + * Determine the nominal datatype of the array elements. We have to + * support the convention that sk_subtype == InvalidOid means the + * opclass input type; this is a hack to simplify life for + * ScanKeyInit(). + */ + elemtype = cur->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[cur->sk_attno - 1]; + Assert(elemtype == ARR_ELEMTYPE(arrayval)); + /* * If the comparison operator is not equality, then the array qual * degenerates to a simple comparison against the smallest or largest @@ -319,7 +404,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) case BTLessStrategyNumber: case BTLessEqualStrategyNumber: cur->sk_argument = - _bt_find_extreme_element(scan, cur, + _bt_find_extreme_element(scan, cur, elemtype, BTGreaterStrategyNumber, elem_values, num_nonnulls); continue; @@ -329,7 +414,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: cur->sk_argument = - _bt_find_extreme_element(scan, cur, + _bt_find_extreme_element(scan, cur, elemtype, BTLessStrategyNumber, elem_values, num_nonnulls); continue; @@ -339,17 +424,93 @@ _bt_preprocess_array_keys(IndexScanDesc scan) break; } + /* + * We'll need a 3-way ORDER proc to perform binary searches for the + * next matching array element. Set that up now. + * + * Array scan keys with cross-type equality operators will require a + * separate same-type ORDER proc for sorting their array. Otherwise, + * sortproc just points to the same proc used during binary searches. + */ + _bt_setup_array_cmp(scan, cur, elemtype, + &so->orderProcs[i], &sortprocp); + /* * Sort the non-null elements and eliminate any duplicates. 
We must * sort in the same ordering used by the index column, so that the - * successive primitive indexscans produce data in index order. + * arrays can be advanced in lockstep with the scan's progress through + * the index's key space. */ - num_elems = _bt_sort_array_elements(scan, cur, - (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, + reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0; + num_elems = _bt_sort_array_elements(cur, sortprocp, reverse, elem_values, num_nonnulls); + if (origarrayatt == cur->sk_attno) + { + BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey]; + + /* + * This array scan key is redundant with a previous equality + * operator array scan key. Merge the two arrays together to + * eliminate contradictory non-intersecting elements (or try to). + * + * We merge this next array back into attribute's original array. + */ + Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno); + Assert(arrayKeyData[orig->scan_key].sk_collation == + cur->sk_collation); + if (_bt_merge_arrays(scan, cur, sortprocp, reverse, + origelemtype, elemtype, + orig->elem_values, &orig->num_elems, + elem_values, num_elems)) + { + /* Successfully eliminated this array */ + pfree(elem_values); + + /* + * If no intersecting elements remain in the original array, + * the scan qual is unsatisfiable + */ + if (orig->num_elems == 0) + { + so->qual_ok = false; + break; + } + + /* + * Indicate to _bt_preprocess_keys caller that it must ignore + * this scan key + */ + cur->sk_strategy = InvalidStrategy; + continue; + } + + /* + * Unable to merge this array with previous array due to a lack of + * suitable cross-type opfamily support. Will need to keep both + * scan keys/arrays. + */ + } + else + { + /* + * This array is the first for current index attribute. + * + * If it turns out to not be the last array (that is, if the next + * array is redundantly applied to this same index attribute), + * we'll then treat this array as the attribute's "original" array + * when merging. + */ + origarrayatt = cur->sk_attno; + origarraykey = numArrayKeys; + origelemtype = elemtype; + } + /* * And set up the BTArrayKeyInfo data. + * + * Note: _bt_preprocess_array_keys_final will fix-up each array's + * scan_key field later on, after so->keyData[] has been finalized. */ so->arrayKeys[numArrayKeys].scan_key = i; so->arrayKeys[numArrayKeys].num_elems = num_elems; @@ -360,6 +521,256 @@ _bt_preprocess_array_keys(IndexScanDesc scan) so->numArrayKeys = numArrayKeys; MemoryContextSwitchTo(oldContext); + + return arrayKeyData; +} + +/* + * _bt_preprocess_array_keys_final() -- fix up array scan key references + * + * When _bt_preprocess_array_keys performed initial array preprocessing, it + * set each array's array->scan_key to the array's arrayKeys[] entry offset + * (that also work as references into the original scan->keyData[] array). + * This function handles translation of the scan key references from the + * BTArrayKeyInfo info array, from input scan key references (to the keys in + * scan->keyData[]), into output references (to the keys in so->keyData[]). + * Caller's keyDataMap[] array tells us how to perform this remapping. + * + * Also finalizes so->orderProcs[] for the scan. Arrays already have an ORDER + * proc, which might need to be repositioned to its so->keyData[]-wise offset + * (very much like the remapping that we apply to array->scan_key references). 
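The sort-and-deduplicate step can be shown standalone: order the elements the same way the index column is ordered, squeeze out adjacent duplicates, and return the new element count, which is the contract _bt_sort_array_elements has. A toy with int elements in ascending order (illustrative names only):

    #include <stdio.h>
    #include <stdlib.h>

    static int
    cmp_asc(const void *a, const void *b)
    {
        int     l = *(const int *) a,
                r = *(const int *) b;

        return (l > r) - (l < r);
    }

    /* sort elems[] in index order, drop duplicates, return new length */
    static int
    sort_array_elements(int *elems, int nelems)
    {
        int     last_non_dup = 0;

        if (nelems <= 1)
            return nelems;
        qsort(elems, nelems, sizeof(int), cmp_asc);
        for (int i = 1; i < nelems; i++)
        {
            if (elems[i] != elems[last_non_dup])
                elems[++last_non_dup] = elems[i];
        }
        return last_non_dup + 1;
    }

    int
    main(void)
    {
        int     elems[] = {5, 1, 5, 3, 1};
        int     n = sort_array_elements(elems, 5);

        for (int i = 0; i < n; i++)
            printf("%d ", elems[i]);    /* prints: 1 3 5 */
        printf("\n");
        return 0;
    }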
+ * Non-array equality strategy scan keys (that survived preprocessing) don't + * yet have an so->orderProcs[] entry, so we set one for them here. + * + * Also converts single-element array scan keys into equivalent non-array + * equality scan keys, which decrements so->numArrayKeys. It's possible that + * this will leave this new btrescan without any arrays at all. This isn't + * necessary for correctness; it's just an optimization. Non-array equality + * scan keys are slightly faster than equivalent array scan keys at runtime. + */ +static void +_bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + int arrayidx = 0; + int last_equal_output_ikey PG_USED_FOR_ASSERTS_ONLY = -1; + + Assert(so->qual_ok); + Assert(so->numArrayKeys); + + for (int output_ikey = 0; output_ikey < so->numberOfKeys; output_ikey++) + { + ScanKey outkey = so->keyData + output_ikey; + int input_ikey; + bool found PG_USED_FOR_ASSERTS_ONLY = false; + + Assert(outkey->sk_strategy != InvalidStrategy); + + if (outkey->sk_strategy != BTEqualStrategyNumber) + continue; + + input_ikey = keyDataMap[output_ikey]; + + Assert(last_equal_output_ikey < output_ikey); + Assert(last_equal_output_ikey < input_ikey); + last_equal_output_ikey = output_ikey; + + /* + * We're lazy about looking up ORDER procs for non-array keys, since + * not all input keys become output keys. Take care of it now. + */ + if (!(outkey->sk_flags & SK_SEARCHARRAY)) + { + Oid elemtype; + + /* No need for an ORDER proc given an IS NULL scan key */ + if (outkey->sk_flags & SK_SEARCHNULL) + continue; + + /* + * A non-required scan key doesn't need an ORDER proc, either + * (unless it's associated with an array, which this one isn't) + */ + if (!(outkey->sk_flags & SK_BT_REQFWD)) + continue; + + elemtype = outkey->sk_subtype; + if (elemtype == InvalidOid) + elemtype = rel->rd_opcintype[outkey->sk_attno - 1]; + + _bt_setup_array_cmp(scan, outkey, elemtype, + &so->orderProcs[output_ikey], NULL); + continue; + } + + /* + * Reorder existing array scan key so->orderProcs[] entries. + * + * Doing this in-place is safe because preprocessing is required to + * output all equality strategy scan keys in original input order + * (among each group of entries against the same index attribute). + * This is also the order that the arrays themselves appear in. + */ + so->orderProcs[output_ikey] = so->orderProcs[input_ikey]; + + /* Fix-up array->scan_key references for arrays */ + for (; arrayidx < so->numArrayKeys; arrayidx++) + { + BTArrayKeyInfo *array = &so->arrayKeys[arrayidx]; + + Assert(array->num_elems > 0); + + if (array->scan_key == input_ikey) + { + /* found it */ + array->scan_key = output_ikey; + found = true; + + /* + * Transform array scan keys that have exactly 1 element + * remaining (following all prior preprocessing) into + * equivalent non-array scan keys. 
+ */ + if (array->num_elems == 1) + { + outkey->sk_flags &= ~SK_SEARCHARRAY; + outkey->sk_argument = array->elem_values[0]; + so->numArrayKeys--; + + /* If we're out of array keys, we can quit right away */ + if (so->numArrayKeys == 0) + return; + + /* Shift other arrays forward */ + memmove(array, array + 1, + sizeof(BTArrayKeyInfo) * + (so->numArrayKeys - arrayidx)); + + /* + * Don't increment arrayidx (there was an entry that was + * just shifted forward to the offset at arrayidx, which + * will still need to be matched) + */ + } + else + { + /* Match found, so done with this array */ + arrayidx++; + } + + break; + } + } + + Assert(found); + } + + /* + * Parallel index scans require space in shared memory to store the + * current array elements (for arrays kept by preprocessing) to schedule + * the next primitive index scan. The underlying structure is protected + * using a spinlock, so defensively limit its size. In practice this can + * only affect parallel scans that use an incomplete opfamily. + */ + if (scan->parallel_scan && so->numArrayKeys > INDEX_MAX_KEYS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("number of array scan keys left by preprocessing (%d) exceeds the maximum allowed by parallel btree index scans (%d)", + so->numArrayKeys, INDEX_MAX_KEYS))); +} + +/* + * _bt_setup_array_cmp() -- Set up array comparison functions + * + * Sets ORDER proc in caller's orderproc argument, which is used during binary + * searches of arrays during the index scan. Also sets a same-type ORDER proc + * in caller's *sortprocp argument, which is used when sorting the array. + * + * Preprocessing calls here with all equality strategy scan keys (when scan + * uses equality array keys), including those not associated with any array. + * See _bt_advance_array_keys for an explanation of why it'll need to treat + * simple scalar equality scan keys as degenerate single element arrays. + * + * Caller should pass an orderproc pointing to space that'll store the ORDER + * proc for the scan, and a *sortprocp pointing to its own separate space. + * When calling here for a non-array scan key, sortprocp arg should be NULL. + * + * In the common case where we don't need to deal with cross-type operators, + * only one ORDER proc is actually required by caller. We'll set *sortprocp + * to point to the same memory that caller's orderproc continues to point to. + * Otherwise, *sortprocp will continue to point to caller's own space. Either + * way, *sortprocp will point to a same-type ORDER proc (since that's the only + * safe way to sort/deduplicate the array associated with caller's scan key). + */ +static void +_bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype, + FmgrInfo *orderproc, FmgrInfo **sortprocp) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + RegProcedure cmp_proc; + Oid opcintype = rel->rd_opcintype[skey->sk_attno - 1]; + + Assert(skey->sk_strategy == BTEqualStrategyNumber); + Assert(OidIsValid(elemtype)); + + /* + * If scankey operator is not a cross-type comparison, we can use the + * cached comparison function; otherwise gotta look it up in the catalogs + */ + if (elemtype == opcintype) + { + /* Set same-type ORDER procs for caller */ + *orderproc = *index_getprocinfo(rel, skey->sk_attno, BTORDER_PROC); + if (sortprocp) + *sortprocp = orderproc; + + return; + } + + /* + * Look up the appropriate cross-type comparison function in the opfamily. 
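The reason two procs can be needed: binary searches compare an index attribute (of the opclass input type) against an array element (possibly some other type), while sorting and deduplicating compares array elements with each other, and those are different comparator signatures under a cross-type qual. A toy with int attributes and double array elements (illustrative names and types, standing in for the two ORDER procs):

    #include <stdio.h>

    /* cross-type comparator: index attribute vs array element */
    static int
    cmp_attr_vs_elem(int attr, double elem)
    {
        return (attr > elem) - (attr < elem);
    }

    /* same-type comparator: used only to sort/deduplicate the array */
    static int
    cmp_elem_vs_elem(double a, double b)
    {
        return (a > b) - (a < b);
    }

    int
    main(void)
    {
        /* binary searches need the cross-type comparator... */
        printf("attr 2 vs elem 2.5: %d\n", cmp_attr_vs_elem(2, 2.5));
        /* ...while array sorting needs the same-type comparator */
        printf("elem 2.5 vs elem 1.5: %d\n", cmp_elem_vs_elem(2.5, 1.5));
        return 0;
    }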
+ * + * Use the opclass input type as the left hand arg type, and the array + * element type as the right hand arg type (since binary searches use an + * index tuple's attribute value to search for a matching array element). + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but only in cases where it's quite likely that _bt_first + * would fail in just the same way (had we not failed before it could). + */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + opcintype, elemtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, opcintype, elemtype, skey->sk_attno, + RelationGetRelationName(rel)); + + /* Set cross-type ORDER proc for caller */ + fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext); + + /* Done if caller doesn't actually have an array they'll need to sort */ + if (!sortprocp) + return; + + /* + * Look up the appropriate same-type comparison function in the opfamily. + * + * Note: it's possible that this would fail, if the opfamily is + * incomplete, but it seems quite unlikely that an opfamily would omit + * non-cross-type comparison procs for any datatype that it supports at + * all. + */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + elemtype, elemtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, elemtype, elemtype, + skey->sk_attno, RelationGetRelationName(rel)); + + /* Set same-type ORDER proc for caller */ + fmgr_info_cxt(cmp_proc, *sortprocp, so->arrayContext); } /* @@ -370,27 +781,17 @@ _bt_preprocess_array_keys(IndexScanDesc scan) * least element, or BTGreaterStrategyNumber to get the greatest. */ static Datum -_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, +_bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, Oid elemtype, StrategyNumber strat, Datum *elems, int nelems) { Relation rel = scan->indexRelation; - Oid elemtype, - cmp_op; + Oid cmp_op; RegProcedure cmp_proc; FmgrInfo flinfo; Datum result; int i; - /* - * Determine the nominal datatype of the array elements. We have to - * support the convention that sk_subtype == InvalidOid means the opclass - * input type; this is a hack to simplify life for ScanKeyInit(). - */ - elemtype = skey->sk_subtype; - if (elemtype == InvalidOid) - elemtype = rel->rd_opcintype[skey->sk_attno - 1]; - /* * Look up the appropriate comparison operator in the opfamily. * @@ -399,6 +800,8 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, * non-cross-type comparison operators for any datatype that it supports * at all. */ + Assert(skey->sk_strategy != BTEqualStrategyNumber); + Assert(OidIsValid(elemtype)); cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1], elemtype, elemtype, @@ -433,50 +836,21 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, * The array elements are sorted in-place, and the new number of elements * after duplicate removal is returned. * - * scan and skey identify the index column, whose opfamily determines the - * comparison semantics. If reverse is true, we sort in descending order. + * skey identifies the index column whose opfamily determines the comparison + * semantics, and sortproc is a corresponding ORDER proc. If reverse is true, + * we sort in descending order. 
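+ *
+ * For example, the elements (5, 3, 5, 1) become (1, 3, 5) once sorted and
+ * deduplicated ((5, 3, 1) when reverse is true).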
 */
 static int
-_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
-						bool reverse,
+_bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc, bool reverse,
 						Datum *elems, int nelems)
 {
-	Relation	rel = scan->indexRelation;
-	Oid			elemtype;
-	RegProcedure cmp_proc;
 	BTSortArrayContext cxt;
 
 	if (nelems <= 1)
 		return nelems;			/* no work to do */
 
-	/*
-	 * Determine the nominal datatype of the array elements.  We have to
-	 * support the convention that sk_subtype == InvalidOid means the opclass
-	 * input type; this is a hack to simplify life for ScanKeyInit().
-	 */
-	elemtype = skey->sk_subtype;
-	if (elemtype == InvalidOid)
-		elemtype = rel->rd_opcintype[skey->sk_attno - 1];
-
-	/*
-	 * Look up the appropriate comparison function in the opfamily.
-	 *
-	 * Note: it's possible that this would fail, if the opfamily is
-	 * incomplete, but it seems quite unlikely that an opfamily would omit
-	 * non-cross-type support functions for any datatype that it supports at
-	 * all.
-	 */
-	cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
-								 elemtype,
-								 elemtype,
-								 BTORDER_PROC);
-	if (!RegProcedureIsValid(cmp_proc))
-		elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
-			 BTORDER_PROC, elemtype, elemtype,
-			 rel->rd_opfamily[skey->sk_attno - 1]);
-
 	/* Sort the array elements */
-	fmgr_info(cmp_proc, &cxt.flinfo);
+	cxt.sortproc = sortproc;
 	cxt.collation = skey->sk_collation;
 	cxt.reverse = reverse;
 	qsort_arg(elems, nelems, sizeof(Datum),
@@ -487,6 +861,232 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 			  _bt_compare_array_elements, &cxt);
 }
 
+/*
+ * _bt_merge_arrays() -- merge next array's elements into an original array
+ *
+ * Called when preprocessing encounters a pair of array equality scan keys,
+ * both against the same index attribute (during initial array preprocessing).
+ * Merging reorganizes caller's original array (the left hand arg) in-place,
+ * without ever copying elements from one array into the other.  (Mixing the
+ * elements together like this would be wrong, since they don't necessarily
+ * use the same underlying element type, despite all the other similarities.)
+ *
+ * Both arrays must have already been sorted and deduplicated by calling
+ * _bt_sort_array_elements.  sortproc is the same-type ORDER proc that was
+ * just used to sort and deduplicate caller's "next" array.  We'll usually be
+ * able to reuse that ORDER proc to merge the arrays together now.  If not,
+ * then we'll perform a separate ORDER proc lookup.
+ *
+ * If the opfamily doesn't supply a complete set of cross-type ORDER procs we
+ * may not be able to determine which elements are contradictory.  If we have
+ * the required ORDER proc then we return true (and validly set *nelems_orig),
+ * guaranteeing that at least the next array can be considered redundant.  We
+ * return false if the required comparisons cannot be made (caller must keep
+ * both arrays when this happens).
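+ *
+ * For example, merging the arrays from a qual "WHERE a IN (1, 2, 3) AND
+ * a IN (2, 3, 4)" truncates caller's original array to just the elements
+ * "2" and "3" (the intersection of the two arrays), at which point the
+ * next array becomes wholly redundant.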
+ */ +static bool +_bt_merge_arrays(IndexScanDesc scan, ScanKey skey, FmgrInfo *sortproc, + bool reverse, Oid origelemtype, Oid nextelemtype, + Datum *elems_orig, int *nelems_orig, + Datum *elems_next, int nelems_next) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSortArrayContext cxt; + int nelems_orig_start = *nelems_orig, + nelems_orig_merged = 0; + FmgrInfo *mergeproc = sortproc; + FmgrInfo crosstypeproc; + + Assert(skey->sk_strategy == BTEqualStrategyNumber); + Assert(OidIsValid(origelemtype) && OidIsValid(nextelemtype)); + + if (origelemtype != nextelemtype) + { + RegProcedure cmp_proc; + + /* + * Cross-array-element-type merging is required, so can't just reuse + * sortproc when merging + */ + cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], + origelemtype, nextelemtype, BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + { + /* Can't make the required comparisons */ + return false; + } + + /* We have all we need to determine redundancy/contradictoriness */ + mergeproc = &crosstypeproc; + fmgr_info_cxt(cmp_proc, mergeproc, so->arrayContext); + } + + cxt.sortproc = mergeproc; + cxt.collation = skey->sk_collation; + cxt.reverse = reverse; + + for (int i = 0, j = 0; i < nelems_orig_start && j < nelems_next;) + { + Datum *oelem = elems_orig + i, + *nelem = elems_next + j; + int res = _bt_compare_array_elements(oelem, nelem, &cxt); + + if (res == 0) + { + elems_orig[nelems_orig_merged++] = *oelem; + i++; + j++; + } + else if (res < 0) + i++; + else /* res > 0 */ + j++; + } + + *nelems_orig = nelems_orig_merged; + + return true; +} + +/* + * Compare an array scan key to a scalar scan key, eliminating contradictory + * array elements such that the scalar scan key becomes redundant. + * + * Array elements can be eliminated as contradictory when excluded by some + * other operator on the same attribute. For example, with an index scan qual + * "WHERE a IN (1, 2, 3) AND a < 2", all array elements except the value "1" + * are eliminated, and the < scan key is eliminated as redundant. Cases where + * every array element is eliminated by a redundant scalar scan key have an + * unsatisfiable qual, which we handle by setting *qual_ok=false for caller. + * + * If the opfamily doesn't supply a complete set of cross-type ORDER procs we + * may not be able to determine which elements are contradictory. If we have + * the required ORDER proc then we return true (and validly set *qual_ok), + * guaranteeing that at least the scalar scan key can be considered redundant. + * We return false if the comparison could not be made (caller must keep both + * scan keys when this happens). 
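+ *
+ * (To extend the example above: with "WHERE a IN (1, 2, 3) AND a > 5", every
+ * array element is eliminated as contradictory, leaving an unsatisfiable
+ * qual, so we set *qual_ok=false.)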
+ */ +static bool +_bt_compare_array_scankey_args(IndexScanDesc scan, ScanKey arraysk, ScanKey skey, + FmgrInfo *orderproc, BTArrayKeyInfo *array, + bool *qual_ok) +{ + Relation rel = scan->indexRelation; + Oid opcintype = rel->rd_opcintype[arraysk->sk_attno - 1]; + int cmpresult = 0, + cmpexact = 0, + matchelem, + new_nelems = 0; + FmgrInfo crosstypeproc; + FmgrInfo *orderprocp = orderproc; + + Assert(arraysk->sk_attno == skey->sk_attno); + Assert(array->num_elems > 0); + Assert(!(arraysk->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert((arraysk->sk_flags & SK_SEARCHARRAY) && + arraysk->sk_strategy == BTEqualStrategyNumber); + Assert(!(skey->sk_flags & (SK_ISNULL | SK_ROW_HEADER | SK_ROW_MEMBER))); + Assert(!(skey->sk_flags & SK_SEARCHARRAY) || + skey->sk_strategy != BTEqualStrategyNumber); + + /* + * _bt_binsrch_array_skey searches an array for the entry best matching a + * datum of opclass input type for the index's attribute (on-disk type). + * We can reuse the array's ORDER proc whenever the non-array scan key's + * type is a match for the corresponding attribute's input opclass type. + * Otherwise, we have to do another ORDER proc lookup so that our call to + * _bt_binsrch_array_skey applies the correct comparator. + * + * Note: we have to support the convention that sk_subtype == InvalidOid + * means the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). + */ + if (skey->sk_subtype != opcintype && skey->sk_subtype != InvalidOid) + { + RegProcedure cmp_proc; + Oid arraysk_elemtype; + + /* + * Need an ORDER proc lookup to detect redundancy/contradictoriness + * with this pair of scankeys. + * + * Scalar scan key's argument will be passed to _bt_compare_array_skey + * as its tupdatum/lefthand argument (rhs arg is for array elements). 
+ */ + arraysk_elemtype = arraysk->sk_subtype; + if (arraysk_elemtype == InvalidOid) + arraysk_elemtype = rel->rd_opcintype[arraysk->sk_attno - 1]; + cmp_proc = get_opfamily_proc(rel->rd_opfamily[arraysk->sk_attno - 1], + skey->sk_subtype, arraysk_elemtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + { + /* Can't make the comparison */ + *qual_ok = false; /* suppress compiler warnings */ + return false; + } + + /* We have all we need to determine redundancy/contradictoriness */ + orderprocp = &crosstypeproc; + fmgr_info(cmp_proc, orderprocp); + } + + matchelem = _bt_binsrch_array_skey(orderprocp, false, + NoMovementScanDirection, + skey->sk_argument, false, array, + arraysk, &cmpresult); + + switch (skey->sk_strategy) + { + case BTLessStrategyNumber: + cmpexact = 1; /* exclude exact match, if any */ + /* FALL THRU */ + case BTLessEqualStrategyNumber: + if (cmpresult >= cmpexact) + matchelem++; + /* Resize, keeping elements from the start of the array */ + new_nelems = matchelem; + break; + case BTEqualStrategyNumber: + if (cmpresult != 0) + { + /* qual is unsatisfiable */ + new_nelems = 0; + } + else + { + /* Shift matching element to the start of the array, resize */ + array->elem_values[0] = array->elem_values[matchelem]; + new_nelems = 1; + } + break; + case BTGreaterEqualStrategyNumber: + cmpexact = 1; /* include exact match, if any */ + /* FALL THRU */ + case BTGreaterStrategyNumber: + if (cmpresult >= cmpexact) + matchelem++; + /* Shift matching elements to the start of the array, resize */ + new_nelems = array->num_elems - matchelem; + memmove(array->elem_values, array->elem_values + matchelem, + sizeof(Datum) * new_nelems); + break; + default: + elog(ERROR, "unrecognized StrategyNumber: %d", + (int) skey->sk_strategy); + break; + } + + Assert(new_nelems >= 0); + Assert(new_nelems <= array->num_elems); + + array->num_elems = new_nelems; + *qual_ok = new_nelems > 0; + + return true; +} + /* * qsort_arg comparator for sorting array elements */ @@ -498,7 +1098,7 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) BTSortArrayContext *cxt = (BTSortArrayContext *) arg; int32 compare; - compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo, + compare = DatumGetInt32(FunctionCall2Coll(cxt->sortproc, cxt->collation, da, db)); if (cxt->reverse) @@ -506,11 +1106,233 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) return compare; } +/* + * _bt_compare_array_skey() -- apply array comparison function + * + * Compares caller's tuple attribute value to a scan key/array element. + * Helper function used during binary searches of SK_SEARCHARRAY arrays. + * + * This routine returns: + * <0 if tupdatum < arrdatum; + * 0 if tupdatum == arrdatum; + * >0 if tupdatum > arrdatum. + * + * This is essentially the same interface as _bt_compare: both functions + * compare the value that they're searching for to a binary search pivot. + * However, unlike _bt_compare, this function's "tuple argument" comes first, + * while its "array/scankey argument" comes second. 
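+ *
+ * NULL values are compared according to the index's NULLS FIRST/NULLS LAST
+ * ordering: a NULL sorts before every non-NULL value when the column is
+ * NULLS FIRST, and after every non-NULL value when it's NULLS LAST (see the
+ * SK_BT_NULLS_FIRST handling below).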
+*/ +static inline int32 +_bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur) +{ + int32 result = 0; + + Assert(cur->sk_strategy == BTEqualStrategyNumber); + + if (tupnull) /* NULL tupdatum */ + { + if (cur->sk_flags & SK_ISNULL) + result = 0; /* NULL "=" NULL */ + else if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * Like _bt_compare, we need to be careful of cross-type comparisons, + * so the left value has to be the value that came from an index tuple + */ + result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation, + tupdatum, arrdatum)); + + /* + * We flip the sign by following the obvious rule: flip whenever the + * column is a DESC column. + * + * _bt_compare does it the wrong way around (flip when *ASC*) in order + * to compensate for passing its orderproc arguments backwards. We + * don't need to play these games because we find it natural to pass + * tupdatum as the left value (and arrdatum as the right value). + */ + if (cur->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(result); + } + + return result; +} + +/* + * _bt_binsrch_array_skey() -- Binary search for next matching array key + * + * Returns an index to the first array element >= caller's tupdatum argument. + * This convention is more natural for forwards scan callers, but that can't + * really matter to backwards scan callers. Both callers require handling for + * the case where the match we return is < tupdatum, and symmetric handling + * for the case where our best match is > tupdatum. + * + * Also sets *set_elem_result to the result _bt_compare_array_skey returned + * when we used it to compare the matching array element to tupdatum/tupnull. + * + * cur_elem_trig indicates if array advancement was triggered by this array's + * scan key, and that the array is for a required scan key. We can apply this + * information to find the next matching array element in the current scan + * direction using far fewer comparisons (fewer on average, compared to naive + * binary search). This scheme takes advantage of an important property of + * required arrays: required arrays always advance in lockstep with the index + * scan's progress through the index's key space. + */ +static int +_bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_trig, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *set_elem_result) +{ + int low_elem = 0, + mid_elem = -1, + high_elem = array->num_elems - 1, + result = 0; + Datum arrdatum; + + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_strategy == BTEqualStrategyNumber); + + if (cur_elem_trig) + { + Assert(!ScanDirectionIsNoMovement(dir)); + Assert(cur->sk_flags & SK_BT_REQFWD); + + /* + * When the scan key that triggered array advancement is a required + * array scan key, it is now certain that the current array element + * (plus all prior elements relative to the current scan direction) + * cannot possibly be at or ahead of the corresponding tuple value. + * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which + * makes sure this is true as a condition of advancing the arrays.) 
+ * + * This makes it safe to exclude array elements up to and including + * the former-current array element from our search. + * + * Separately, when array advancement was triggered by a required scan + * key, the array element immediately after the former-current element + * is often either an exact tupdatum match, or a "close by" near-match + * (a near-match tupdatum is one whose key space falls _between_ the + * former-current and new-current array elements). We'll detect both + * cases via an optimistic comparison of the new search lower bound + * (or new search upper bound in the case of backwards scans). + */ + if (ScanDirectionIsForward(dir)) + { + low_elem = array->cur_elem + 1; /* old cur_elem exhausted */ + + /* Compare prospective new cur_elem (also the new lower bound) */ + if (high_elem >= low_elem) + { + arrdatum = array->elem_values[low_elem]; + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result <= 0) + { + /* Optimistic comparison optimization worked out */ + *set_elem_result = result; + return low_elem; + } + mid_elem = low_elem; + low_elem++; /* this cur_elem exhausted, too */ + } + + if (high_elem < low_elem) + { + /* Caller needs to perform "beyond end" array advancement */ + *set_elem_result = 1; + return high_elem; + } + } + else + { + high_elem = array->cur_elem - 1; /* old cur_elem exhausted */ + + /* Compare prospective new cur_elem (also the new upper bound) */ + if (high_elem >= low_elem) + { + arrdatum = array->elem_values[high_elem]; + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result >= 0) + { + /* Optimistic comparison optimization worked out */ + *set_elem_result = result; + return high_elem; + } + mid_elem = high_elem; + high_elem--; /* this cur_elem exhausted, too */ + } + + if (high_elem < low_elem) + { + /* Caller needs to perform "beyond end" array advancement */ + *set_elem_result = -1; + return low_elem; + } + } + } + + while (high_elem > low_elem) + { + mid_elem = low_elem + ((high_elem - low_elem) / 2); + arrdatum = array->elem_values[mid_elem]; + + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result == 0) + { + /* + * It's safe to quit as soon as we see an equal array element. + * This often saves an extra comparison or two... + */ + low_elem = mid_elem; + break; + } + + if (result > 0) + low_elem = mid_elem + 1; + else + high_elem = mid_elem; + } + + /* + * ...but our caller also cares about how its searched-for tuple datum + * compares to the low_elem datum. Must always set *set_elem_result with + * the result of that comparison specifically. + */ + if (low_elem != mid_elem) + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + array->elem_values[low_elem], cur); + + *set_elem_result = result; + + return low_elem; +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * * Set up the cur_elem counters and fill in the first sk_argument value for - * each array scankey. We can't do this until we know the scan direction. + * each array scankey. 
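+ * ("First" is relative to the scan direction: element 0 for forward scans,
+ * the final element for backward scans.)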
*/ void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) @@ -518,159 +1340,1132 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; int i; + Assert(so->numArrayKeys); + Assert(so->qual_ok); + for (i = 0; i < so->numArrayKeys; i++) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + ScanKey skey = &so->keyData[curArrayKey->scan_key]; Assert(curArrayKey->num_elems > 0); + Assert(skey->sk_flags & SK_SEARCHARRAY); + if (ScanDirectionIsBackward(dir)) curArrayKey->cur_elem = curArrayKey->num_elems - 1; else curArrayKey->cur_elem = 0; skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; } - - so->arraysStarted = true; + so->scanBehind = false; } /* - * _bt_advance_array_keys() -- Advance to next set of array elements + * _bt_advance_array_keys_increment() -- Advance to next set of array elements + * + * Advances the array keys by a single increment in the current scan + * direction. When there are multiple array keys this can roll over from the + * lowest order array to higher order arrays. * * Returns true if there is another set of values to consider, false if not. * On true result, the scankeys are initialized with the next set of values. + * On false result, the scankeys stay the same, and the array keys are not + * advanced (every array remains at its final element for scan direction). */ -bool -_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) +static bool +_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - bool found = false; - int i; /* * We must advance the last array key most quickly, since it will * correspond to the lowest-order index column among the available - * qualifications. This is necessary to ensure correct ordering of output - * when there are multiple array keys. + * qualifications */ - for (i = so->numArrayKeys - 1; i >= 0; i--) + for (int i = so->numArrayKeys - 1; i >= 0; i--) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; + ScanKey skey = &so->keyData[curArrayKey->scan_key]; int cur_elem = curArrayKey->cur_elem; int num_elems = curArrayKey->num_elems; + bool rolled = false; - if (ScanDirectionIsBackward(dir)) + if (ScanDirectionIsForward(dir) && ++cur_elem >= num_elems) { - if (--cur_elem < 0) - { - cur_elem = num_elems - 1; - found = false; /* need to advance next array key */ - } - else - found = true; + cur_elem = 0; + rolled = true; } - else + else if (ScanDirectionIsBackward(dir) && --cur_elem < 0) { - if (++cur_elem >= num_elems) - { - cur_elem = 0; - found = false; /* need to advance next array key */ - } - else - found = true; + cur_elem = num_elems - 1; + rolled = true; } curArrayKey->cur_elem = cur_elem; skey->sk_argument = curArrayKey->elem_values[cur_elem]; - if (found) - break; - } + if (!rolled) + return true; - /* advance parallel scan */ - if (scan->parallel_scan != NULL) - _bt_parallel_advance_array_keys(scan); + /* Need to advance next array key, if any */ + } /* - * When no new array keys were found, the scan is "past the end" of the - * array keys. _bt_start_array_keys can still "restart" the array keys if - * a rescan is required. + * The array keys are now exhausted. (There isn't actually a distinct + * state that represents array exhaustion, since index scans don't always + * end after btgettuple returns "false".) 
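+	 *
+	 * (For example, given a forward scan with arrays on "a" IN (1, 2) and
+	 * "b" IN (10, 20), increments step through the element pairs (1, 10),
+	 * (1, 20), (2, 10), and (2, 20); it's an increment past the final pair
+	 * that leaves the arrays exhausted.)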
+	 *
+	 * Restore the array keys to the state they were in immediately before we
+	 * were called.  This ensures that the arrays only ever ratchet in the
+	 * current scan direction.  Without this, scans would overlook matching
+	 * tuples if and when the scan's direction was subsequently reversed.
 	 */
-	if (!found)
-		so->arraysStarted = false;
+	_bt_start_array_keys(scan, -dir);
 
-	return found;
+	return false;
 }
 
 /*
- * _bt_mark_array_keys() -- Handle array keys during btmarkpos
+ * _bt_rewind_nonrequired_arrays() -- Rewind non-required arrays
  *
- * Save the current state of the array keys as the "mark" position.
+ * Called when _bt_advance_array_keys decides to start a new primitive index
+ * scan on the basis of the current scan position being before the position
+ * that _bt_first is capable of repositioning the scan to by applying an
+ * inequality operator required in the opposite-to-scan direction only.
+ *
+ * Although equality strategy scan keys (for both arrays and non-arrays alike)
+ * are either marked required in both directions or in neither direction,
+ * there is a sense in which non-required arrays behave like required arrays.
+ * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)",
+ * the scan key on "c" is non-required, but nevertheless enables positioning
+ * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the
+ * first descent of the tree by _bt_first.  Later on, there could also be a
+ * second descent that places the scan right before tuples >= "(200, 3, 5)".
+ * _bt_first must never be allowed to build an insertion scan key whose "c"
+ * entry is set to a value other than 5, the "c" array's first element/value.
+ * (Actually, it's the first in the current scan direction.  This example uses
+ * a forward scan.)
+ *
+ * Calling here resets the array scan key elements for the scan's non-required
+ * arrays.  This is strictly necessary for correctness in a subset of cases
+ * involving "required in opposite direction"-triggered primitive index scans.
+ * Not all callers are at risk of _bt_first using a non-required array like
+ * this, but advancement always resets the arrays when another primitive scan
+ * is scheduled, just to keep things simple.  Array advancement even makes
+ * sure to reset non-required arrays during scans that have no inequalities.
+ * (Advancement still won't call here when there are no inequalities, though
+ * that's just because it's all handled indirectly instead.)
+ *
+ * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that
+ * everybody got this right.
 */
-void
-_bt_mark_array_keys(IndexScanDesc scan)
+static void
+_bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir)
 {
 	BTScanOpaque so = (BTScanOpaque) scan->opaque;
-	int			i;
+	int			arrayidx = 0;
 
-	for (i = 0; i < so->numArrayKeys; i++)
+	for (int ikey = 0; ikey < so->numberOfKeys; ikey++)
 	{
-		BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
+		ScanKey		cur = so->keyData + ikey;
+		BTArrayKeyInfo *array = NULL;
+		int			first_elem_dir;
 
-		curArrayKey->mark_elem = curArrayKey->cur_elem;
+		if (!(cur->sk_flags & SK_SEARCHARRAY) ||
+			cur->sk_strategy != BTEqualStrategyNumber)
+			continue;
+
+		array = &so->arrayKeys[arrayidx++];
+		Assert(array->scan_key == ikey);
+
+		if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)))
+			continue;
+
+		if (ScanDirectionIsForward(dir))
+			first_elem_dir = 0;
+		else
+			first_elem_dir = array->num_elems - 1;
+
+		if (array->cur_elem != first_elem_dir)
+		{
+			array->cur_elem = first_elem_dir;
+			cur->sk_argument = array->elem_values[first_elem_dir];
+		}
 	}
 }
 
 /*
- * _bt_restore_array_keys() -- Handle array keys during btrestrpos
+ * _bt_tuple_before_array_skeys() -- too early to advance required arrays?
  *
- * Restore the array keys to where they were when the mark was set.
+ * We always compare the tuple using the current array keys (which we assume
+ * are already set in so->keyData[]).  readpagetup indicates if tuple is the
+ * scan's current _bt_readpage-wise tuple.
+ *
+ * readpagetup callers must only call here when _bt_check_compare already set
+ * continuescan=false.  We help these callers deal with _bt_check_compare's
+ * inability to distinguish between the < and > cases (it uses equality
+ * operator scan keys, whereas we use 3-way ORDER procs).  These callers pass
+ * a _bt_check_compare-set sktrig value that indicates which scan key
+ * triggered the call (!readpagetup callers just pass us sktrig=0 instead).
+ * This information allows us to avoid wastefully checking earlier scan keys
+ * that were already deemed to have been satisfied inside _bt_check_compare.
+ *
+ * Returns false when caller's tuple is >= the current required equality scan
+ * keys (or <=, in the case of backwards scans).  This happens to readpagetup
+ * callers when the scan has reached the point of needing its array keys
+ * advanced; caller will need to advance required and non-required arrays at
+ * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over.
+ * (When we return false to readpagetup callers, tuple can only be == current
+ * required equality scan keys when caller's sktrig indicates that the arrays
+ * need to be advanced due to an unsatisfied required inequality key trigger.)
+ *
+ * Returns true when caller passes a tuple that is < the current set of
+ * equality keys for the most significant non-equal required scan key/column
+ * (or > the keys, during backwards scans).  This happens to readpagetup
+ * callers when tuple is still before the start of matches for the scan's
+ * required equality strategy scan keys.  (sktrig can't have indicated that an
+ * inequality strategy scan key wasn't satisfied in _bt_check_compare when we
+ * return true.  In fact, we automatically return false when passed such an
+ * inequality sktrig by readpagetup callers -- _bt_check_compare's initial
+ * continuescan=false doesn't really need to be confirmed here by us.)
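+ *
+ * (For example, during a forward scan whose current array keys include
+ * "a" = 2, a tuple with "a" = 1 is still before the array keys, so we return
+ * true; a tuple with "a" = 3 is already past them, so we return false, and
+ * caller will need to advance the arrays.)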
+ * + * !readpagetup callers optionally pass us *scanBehind, which tracks whether + * any missing truncated attributes might have affected array advancement + * (compared to what would happen if it was shown the first non-pivot tuple on + * the page to the right of caller's finaltup/high key tuple instead). It's + * only possible that we'll set *scanBehind to true when caller passes us a + * pivot tuple (with truncated -inf attributes) that we return false for. */ -void -_bt_restore_array_keys(IndexScanDesc scan) +static bool +_bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, TupleDesc tupdesc, int tupnatts, + bool readpagetup, int sktrig, bool *scanBehind) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - bool changed = false; - int i; - /* Restore each array key to its position when the mark was set */ - for (i = 0; i < so->numArrayKeys; i++) + Assert(so->numArrayKeys); + Assert(so->numberOfKeys); + Assert(sktrig == 0 || readpagetup); + Assert(!readpagetup || scanBehind == NULL); + + if (scanBehind) + *scanBehind = false; + + for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) { - BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; - ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; - int mark_elem = curArrayKey->mark_elem; + ScanKey cur = so->keyData + ikey; + Datum tupdatum; + bool tupnull; + int32 result; - if (curArrayKey->cur_elem != mark_elem) + /* readpagetup calls require one ORDER proc comparison (at most) */ + Assert(!readpagetup || ikey == sktrig); + + /* + * Once we reach a non-required scan key, we're completely done. + * + * Note: we deliberately don't consider the scan direction here. + * _bt_advance_array_keys caller requires that we track *scanBehind + * without concern for scan direction. + */ + if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0) { - curArrayKey->cur_elem = mark_elem; - skey->sk_argument = curArrayKey->elem_values[mark_elem]; - changed = true; + Assert(!readpagetup); + Assert(ikey > sktrig || ikey == 0); + return false; + } + + if (cur->sk_attno > tupnatts) + { + Assert(!readpagetup); + + /* + * When we reach a high key's truncated attribute, assume that the + * tuple attribute's value is >= the scan's equality constraint + * scan keys (but set *scanBehind to let interested callers know + * that a truncated attribute might have affected our answer). + */ + if (scanBehind) + *scanBehind = true; + + return false; + } + + /* + * Deal with inequality strategy scan keys that _bt_check_compare set + * continuescan=false for + */ + if (cur->sk_strategy != BTEqualStrategyNumber) + { + /* + * When _bt_check_compare indicated that a required inequality + * scan key wasn't satisfied, there's no need to verify anything; + * caller always calls _bt_advance_array_keys with this sktrig. + */ + if (readpagetup) + return false; + + /* + * Otherwise we can't give up, since we must check all required + * scan keys (required in either direction) in order to correctly + * track *scanBehind for caller + */ + continue; + } + + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + + /* + * Does this comparison indicate that caller must _not_ advance the + * scan's arrays just yet? + */ + if ((ScanDirectionIsForward(dir) && result < 0) || + (ScanDirectionIsBackward(dir) && result > 0)) + return true; + + /* + * Does this comparison indicate that caller should now advance the + * scan's arrays? 
(Must be if we get here during a readpagetup call.) + */ + if (readpagetup || result != 0) + { + Assert(result != 0); + return false; + } + + /* + * Inconclusive -- need to check later scan keys, too. + * + * This must be a finaltup precheck, or a call made from an assertion. + */ + Assert(result == 0); + } + + Assert(!readpagetup); + + return false; +} + +/* + * _bt_start_prim_scan() -- start scheduled primitive index scan? + * + * Returns true if _bt_checkkeys scheduled another primitive index scan, just + * as the last one ended. Otherwise returns false, indicating that the array + * keys are now fully exhausted. + * + * Only call here during scans with one or more equality type array scan keys, + * after _bt_first or _bt_next return false. + */ +bool +_bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(so->numArrayKeys); + + /* scanBehind flag doesn't persist across primitive index scans - reset */ + so->scanBehind = false; + + /* + * Array keys are advanced within _bt_checkkeys when the scan reaches the + * leaf level (more precisely, they're advanced when the scan reaches the + * end of each distinct set of array elements). This process avoids + * repeat access to leaf pages (across multiple primitive index scans) by + * advancing the scan's array keys when it allows the primitive index scan + * to find nearby matching tuples (or when it eliminates ranges of array + * key space that can't possibly be satisfied by any index tuple). + * + * _bt_checkkeys sets a simple flag variable to schedule another primitive + * index scan. The flag tells us what to do. + * + * We cannot rely on _bt_first always reaching _bt_checkkeys. There are + * various cases where that won't happen. For example, if the index is + * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys. + * We also don't expect a call to _bt_checkkeys during searches for a + * non-existent value that happens to be lower/higher than any existing + * value in the index. + * + * We don't require special handling for these cases -- we don't need to + * be explicitly instructed to _not_ perform another primitive index scan. + * It's up to code under the control of _bt_first to always set the flag + * when another primitive index scan will be required. + * + * This works correctly, even with the tricky cases listed above, which + * all involve access to leaf pages "near the boundaries of the key space" + * (whether it's from a leftmost/rightmost page, or an imaginary empty + * leaf root page). If _bt_checkkeys cannot be reached by a primitive + * index scan for one set of array keys, then it also won't be reached for + * any later set ("later" in terms of the direction that we scan the index + * and advance the arrays). The array keys won't have advanced in these + * cases, but that's the correct behavior (even _bt_advance_array_keys + * won't always advance the arrays at the point they become "exhausted"). 
+	 */
+	if (so->needPrimScan)
+	{
+		Assert(_bt_verify_arrays_bt_first(scan, dir));
+
+		/*
+		 * Flag was set -- must call _bt_first again, which will reset the
+		 * scan's needPrimScan flag
+		 */
+		return true;
+	}
+
+	/* The top-level index scan ran out of tuples in this scan direction */
+	if (scan->parallel_scan != NULL)
+		_bt_parallel_done(scan);
+
+	return false;
+}
+
+/*
+ * _bt_advance_array_keys() -- Advance array elements using a tuple
+ *
+ * The scan always gets a new qual as a consequence of calling here (except
+ * when we determine that the top-level scan has run out of matching tuples).
+ * All later _bt_check_compare calls also use the same new qual that was first
+ * used here (at least until the next call here advances the keys once again).
+ * It's convenient to structure _bt_check_compare rechecks of caller's tuple
+ * (using the new qual) as one of the steps of advancing the scan's array
+ * keys, so this function works as a wrapper around _bt_check_compare.
+ *
+ * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the
+ * caller, and return a boolean indicating if caller's tuple satisfies the
+ * scan's new qual.  But unlike _bt_check_compare, we set so->needPrimScan
+ * when we set continuescan=false, indicating if a new primitive index scan
+ * has been scheduled (otherwise, the top-level scan has run out of tuples in
+ * the current scan direction).
+ *
+ * Caller must use _bt_tuple_before_array_skeys to determine if the current
+ * place in the scan is >= the current array keys _before_ calling here.
+ * We're responsible for ensuring that caller's tuple is <= the newly advanced
+ * required array keys once we return.  We try to find an exact match, but
+ * failing that we'll advance the array keys to whatever set of array elements
+ * comes next in the key space for the current scan direction.  Required array
+ * keys "ratchet forwards" (or backwards).  They can only advance as the scan
+ * itself advances through the index/key space.
+ *
+ * (The rules are the same for backwards scans, except that the operators are
+ * flipped: just replace the precondition's >= operator with a <=, and the
+ * postcondition's <= operator with a >=.  In other words, just swap the
+ * precondition with the postcondition.)
+ *
+ * We also deal with "advancing" non-required arrays here.  Callers whose
+ * sktrig scan key is non-required specify sktrig_required=false.  These calls
+ * are the only exception to the general rule about always advancing the
+ * required array keys (the scan may not even have a required array).  These
+ * callers should just pass a NULL pstate (since there is never any question
+ * of stopping the scan).  No call to _bt_tuple_before_array_skeys is required
+ * ahead of these calls (it's already clear that any required scan keys must
+ * be satisfied by caller's tuple).
+ *
+ * Note that we deal with non-array required equality strategy scan keys as
+ * degenerate single element arrays here.  Obviously, they can never really
+ * advance in the way that real arrays can, but they must still affect how we
+ * advance real array scan keys (exactly like true array equality scan keys).
+ * We have to keep around a 3-way ORDER proc for these (using the "=" operator
+ * won't do), since in general whether the tuple is < or > _any_ unsatisfied
+ * required equality key influences how the scan's real arrays must advance.
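+ *
+ * (For example, with a qual "WHERE a = 42 AND b IN (1, 2, 3)", the scalar key
+ * on "a" is treated as a degenerate single element array: whether caller's
+ * tuple has an "a" value that's <, =, or > 42 constrains how the "b" array
+ * can safely advance.)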
+ * + * Note also that we may sometimes need to advance the array keys when the + * existing required array keys (and other required equality keys) are already + * an exact match for every corresponding value from caller's tuple. We must + * do this for inequalities that _bt_check_compare set continuescan=false for. + * They'll advance the array keys here, just like any other scan key that + * _bt_check_compare stops on. (This can even happen _after_ we advance the + * array keys, in which case we'll advance the array keys a second time. That + * way _bt_checkkeys caller always has its required arrays advance to the + * maximum possible extent that its tuple will allow.) + */ +static bool +_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + int sktrig, bool sktrig_required) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = pstate ? pstate->dir : ForwardScanDirection; + int arrayidx = 0; + bool beyond_end_advance = false, + has_required_opposite_direction_only = false, + oppodir_inequality_sktrig = false, + all_required_satisfied = true, + all_satisfied = true; + + if (sktrig_required) + { + /* + * Precondition array state assertion + */ + Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, + tupnatts, false, 0, NULL)); + + so->scanBehind = false; /* reset */ + + /* + * Required scan key wasn't satisfied, so required arrays will have to + * advance. Invalidate page-level state that tracks whether the + * scan's required-in-opposite-direction-only keys are known to be + * satisfied by page's remaining tuples. + */ + pstate->firstmatch = false; + + /* Shouldn't have to invalidate 'prechecked', though */ + Assert(!pstate->prechecked); + + /* + * Once we return we'll have a new set of required array keys, so + * reset state used by "look ahead" optimization + */ + pstate->rechecks = 0; + pstate->targetdistance = 0; + } + + Assert(_bt_verify_keys_with_arraykeys(scan)); + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array = NULL; + Datum tupdatum; + bool required = false, + required_opposite_direction_only = false, + tupnull; + int32 result; + int set_elem = 0; + + if (cur->sk_strategy == BTEqualStrategyNumber) + { + /* Manage array state */ + if (cur->sk_flags & SK_SEARCHARRAY) + { + array = &so->arrayKeys[arrayidx++]; + Assert(array->scan_key == ikey); + } + } + else + { + /* + * Are any inequalities required in the opposite direction only + * present here? + */ + if (((ScanDirectionIsForward(dir) && + (cur->sk_flags & (SK_BT_REQBKWD))) || + (ScanDirectionIsBackward(dir) && + (cur->sk_flags & (SK_BT_REQFWD))))) + has_required_opposite_direction_only = + required_opposite_direction_only = true; + } + + /* Optimization: skip over known-satisfied scan keys */ + if (ikey < sktrig) + continue; + + if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) + { + Assert(sktrig_required); + + required = true; + + if (cur->sk_attno > tupnatts) + { + /* Set this just like _bt_tuple_before_array_skeys */ + Assert(sktrig < ikey); + so->scanBehind = true; + } + } + + /* + * Handle a required non-array scan key that the initial call to + * _bt_check_compare indicated triggered array advancement, if any. + * + * The non-array scan key's strategy will be <, <=, or = during a + * forwards scan (or any one of =, >=, or > during a backwards scan). 
+ * It follows that the corresponding tuple attribute's value must now + * be either > or >= the scan key value (for backwards scans it must + * be either < or <= that value). + * + * If this is a required equality strategy scan key, this is just an + * optimization; _bt_tuple_before_array_skeys already confirmed that + * this scan key places us ahead of caller's tuple. There's no need + * to repeat that work now. (The same underlying principle also gets + * applied by the cur_elem_trig optimization used to speed up searches + * for the next array element.) + * + * If this is a required inequality strategy scan key, we _must_ rely + * on _bt_check_compare like this; we aren't capable of directly + * evaluating required inequality strategy scan keys here, on our own. + */ + if (ikey == sktrig && !array) + { + Assert(sktrig_required && required && all_required_satisfied); + + /* Use "beyond end" advancement. See below for an explanation. */ + beyond_end_advance = true; + all_satisfied = all_required_satisfied = false; + + /* + * Set a flag that remembers that this was an inequality required + * in the opposite scan direction only, that nevertheless + * triggered the call here. + * + * This only happens when an inequality operator (which must be + * strict) encounters a group of NULLs that indicate the end of + * non-NULL values for tuples in the current scan direction. + */ + if (unlikely(required_opposite_direction_only)) + oppodir_inequality_sktrig = true; + + continue; + } + + /* + * Nothing more for us to do with an inequality strategy scan key that + * wasn't the one that _bt_check_compare stopped on, though. + * + * Note: if our later call to _bt_check_compare (to recheck caller's + * tuple) sets continuescan=false due to finding this same inequality + * unsatisfied (possible when it's required in the scan direction), + * we'll deal with it via a recursive "second pass" call. + */ + else if (cur->sk_strategy != BTEqualStrategyNumber) + continue; + + /* + * Nothing for us to do with an equality strategy scan key that isn't + * marked required, either -- unless it's a non-required array + */ + else if (!required && !array) + continue; + + /* + * Here we perform steps for all array scan keys after a required + * array scan key whose binary search triggered "beyond end of array + * element" array advancement due to encountering a tuple attribute + * value > the closest matching array key (or < for backwards scans). + */ + if (beyond_end_advance) + { + int final_elem_dir; + + if (ScanDirectionIsBackward(dir) || !array) + final_elem_dir = 0; + else + final_elem_dir = array->num_elems - 1; + + if (array && array->cur_elem != final_elem_dir) + { + array->cur_elem = final_elem_dir; + cur->sk_argument = array->elem_values[final_elem_dir]; + } + + continue; + } + + /* + * Here we perform steps for all array scan keys after a required + * array scan key whose tuple attribute was < the closest matching + * array key when we dealt with it (or > for backwards scans). + * + * This earlier required array key already puts us ahead of caller's + * tuple in the key space (for the current scan direction). We must + * make sure that subsequent lower-order array keys do not put us too + * far ahead (ahead of tuples that have yet to be seen by our caller). + * For example, when a tuple "(a, b) = (42, 5)" advances the array + * keys on "a" from 40 to 45, we must also set "b" to whatever the + * first array element for "b" is. It would be wrong to allow "b" to + * be set based on the tuple value. 
+ * + * Perform the same steps with truncated high key attributes. You can + * think of this as a "binary search" for the element closest to the + * value -inf. Again, the arrays must never get ahead of the scan. + */ + if (!all_required_satisfied || cur->sk_attno > tupnatts) + { + int first_elem_dir; + + if (ScanDirectionIsForward(dir) || !array) + first_elem_dir = 0; + else + first_elem_dir = array->num_elems - 1; + + if (array && array->cur_elem != first_elem_dir) + { + array->cur_elem = first_elem_dir; + cur->sk_argument = array->elem_values[first_elem_dir]; + } + + continue; + } + + /* + * Search in scankey's array for the corresponding tuple attribute + * value from caller's tuple + */ + tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); + + if (array) + { + bool cur_elem_trig = (sktrig_required && ikey == sktrig); + + /* + * Binary search for closest match that's available from the array + */ + set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], + cur_elem_trig, dir, + tupdatum, tupnull, array, cur, + &result); + + Assert(set_elem >= 0 && set_elem < array->num_elems); + } + else + { + Assert(sktrig_required && required); + + /* + * This is a required non-array equality strategy scan key, which + * we'll treat as a degenerate single element array. + * + * This scan key's imaginary "array" can't really advance, but it + * can still roll over like any other array. (Actually, this is + * no different to real single value arrays, which never advance + * without rolling over -- they can never truly advance, either.) + */ + result = _bt_compare_array_skey(&so->orderProcs[ikey], + tupdatum, tupnull, + cur->sk_argument, cur); + } + + /* + * Consider "beyond end of array element" array advancement. + * + * When the tuple attribute value is > the closest matching array key + * (or < in the backwards scan case), we need to ratchet this array + * forward (backward) by one increment, so that caller's tuple ends up + * being < final array value instead (or > final array value instead). + * This process has to work for all of the arrays, not just this one: + * it must "carry" to higher-order arrays when the set_elem that we + * just found happens to be the final one for the scan's direction. + * Incrementing (decrementing) set_elem itself isn't good enough. + * + * Our approach is to provisionally use set_elem as if it was an exact + * match now, then set each later/less significant array to whatever + * its final element is. Once outside the loop we'll then "increment + * this array's set_elem" by calling _bt_advance_array_keys_increment. + * That way the process rolls over to higher order arrays as needed. + * + * Under this scheme any required arrays only ever ratchet forwards + * (or backwards), and always do so to the maximum possible extent + * that we can know will be safe without seeing the scan's next tuple. + * We don't need any special handling for required scan keys that lack + * a real array to advance, nor for redundant scan keys that couldn't + * be eliminated by _bt_preprocess_keys. It won't matter if some of + * our "true" array scan keys (or even all of them) are non-required. 
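+		 *
+		 * (For example, if a forward scan's "a" array holds (10, 20, 30)
+		 * and caller's tuple has "a" = 35, no remaining "a" element can
+		 * match.  We provisionally use the final element 30, set all later
+		 * arrays to their final elements too, and let the increment call
+		 * below carry the advancement into the next-higher-order array --
+		 * or exhaust the scan's arrays when there is none.)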
+ */ + if (required && + ((ScanDirectionIsForward(dir) && result > 0) || + (ScanDirectionIsBackward(dir) && result < 0))) + beyond_end_advance = true; + + Assert(all_required_satisfied && all_satisfied); + if (result != 0) + { + /* + * Track whether caller's tuple satisfies our new post-advancement + * qual, for required scan keys, as well as for the entire set of + * interesting scan keys (all required scan keys plus non-required + * array scan keys are considered interesting.) + */ + all_satisfied = false; + if (required) + all_required_satisfied = false; + else + { + /* + * There's no need to advance the arrays using the best + * available match for a non-required array. Give up now. + * (Though note that sktrig_required calls still have to do + * all the usual post-advancement steps, including the recheck + * call to _bt_check_compare.) + */ + break; + } + } + + /* Advance array keys, even when set_elem isn't an exact match */ + if (array && array->cur_elem != set_elem) + { + array->cur_elem = set_elem; + cur->sk_argument = array->elem_values[set_elem]; } } /* - * If we changed any keys, we must redo _bt_preprocess_keys. That might - * sound like overkill, but in cases with multiple keys per index column - * it seems necessary to do the full set of pushups. - * - * Also do this whenever the scan's set of array keys "wrapped around" at - * the end of the last primitive index scan. There won't have been a call - * to _bt_preprocess_keys from some other place following wrap around, so - * we do it for ourselves. + * Advance the array keys incrementally whenever "beyond end of array + * element" array advancement happens, so that advancement will carry to + * higher-order arrays (might exhaust all the scan's arrays instead, which + * ends the top-level scan). */ - if (changed || !so->arraysStarted) - { - _bt_preprocess_keys(scan); - /* The mark should have been set on a consistent set of keys... */ - Assert(so->qual_ok); - } -} + if (beyond_end_advance && !_bt_advance_array_keys_increment(scan, dir)) + goto end_toplevel_scan; + Assert(_bt_verify_keys_with_arraykeys(scan)); + + /* + * Does tuple now satisfy our new qual? Recheck with _bt_check_compare. + * + * Calls triggered by an unsatisfied required scan key, whose tuple now + * satisfies all required scan keys, but not all nonrequired array keys, + * will still require a recheck call to _bt_check_compare. They'll still + * need its "second pass" handling of required inequality scan keys. + * (Might have missed a still-unsatisfied required inequality scan key + * that caller didn't detect as the sktrig scan key during its initial + * _bt_check_compare call that used the old/original qual.) + * + * Calls triggered by an unsatisfied nonrequired array scan key never need + * "second pass" handling of required inequalities (nor any other handling + * of any required scan key). All that matters is whether caller's tuple + * satisfies the new qual, so it's safe to just skip the _bt_check_compare + * recheck when we've already determined that it can only return 'false'. 
+ */ + if ((sktrig_required && all_required_satisfied) || + (!sktrig_required && all_satisfied)) + { + int nsktrig = sktrig + 1; + bool continuescan; + + Assert(all_required_satisfied); + + /* Recheck _bt_check_compare on behalf of caller */ + if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, + false, false, false, + &continuescan, &nsktrig) && + !so->scanBehind) + { + /* This tuple satisfies the new qual */ + Assert(all_satisfied && continuescan); + + if (pstate) + pstate->continuescan = true; + + return true; + } + + /* + * Consider "second pass" handling of required inequalities. + * + * It's possible that our _bt_check_compare call indicated that the + * scan should end due to some unsatisfied inequality that wasn't + * initially recognized as such by us. Handle this by calling + * ourselves recursively, this time indicating that the trigger is the + * inequality that we missed first time around (and using a set of + * required array/equality keys that are now exact matches for tuple). + * + * We make a strong, general guarantee that every _bt_checkkeys call + * here will advance the array keys to the maximum possible extent + * that we can know to be safe based on caller's tuple alone. If we + * didn't perform this step, then that guarantee wouldn't quite hold. + */ + if (unlikely(!continuescan)) + { + bool satisfied PG_USED_FOR_ASSERTS_ONLY; + + Assert(sktrig_required); + Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber); + + /* + * The tuple must use "beyond end" advancement during the + * recursive call, so we cannot possibly end up back here when + * recursing. We'll consume a small, fixed amount of stack space. + */ + Assert(!beyond_end_advance); + + /* Advance the array keys a second time using same tuple */ + satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts, + tupdesc, nsktrig, true); + + /* This tuple doesn't satisfy the inequality */ + Assert(!satisfied); + return false; + } + + /* + * Some non-required scan key (from new qual) still not satisfied. + * + * All scan keys required in the current scan direction must still be + * satisfied, though, so we can trust all_required_satisfied below. + */ + } + + /* + * When we were called just to deal with "advancing" non-required arrays, + * this is as far as we can go (cannot stop the scan for these callers) + */ + if (!sktrig_required) + { + /* Caller's tuple doesn't match any qual */ + return false; + } + + /* + * Postcondition array state assertion (for still-unsatisfied tuples). + * + * By here we have established that the scan's required arrays (scan must + * have at least one required array) advanced, without becoming exhausted. + * + * Caller's tuple is now < the newly advanced array keys (or > when this + * is a backwards scan), except in the case where we only got this far due + * to an unsatisfied non-required scan key. Verify that with an assert. + * + * Note: we don't just quit at this point when all required scan keys were + * found to be satisfied because we need to consider edge-cases involving + * scan keys required in the opposite direction only; those aren't tracked + * by all_required_satisfied. (Actually, oppodir_inequality_sktrig trigger + * scan keys are tracked by all_required_satisfied, since it's convenient + * for _bt_check_compare to behave as if they are required in the current + * scan direction to deal with NULLs. We'll account for that separately.) 
+ */ + Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, + false, 0, NULL) == + !all_required_satisfied); + + /* + * We generally permit primitive index scans to continue onto the next + * sibling page when the page's finaltup satisfies all required scan keys + * at the point where we're between pages. + * + * If caller's tuple is also the page's finaltup, and we see that required + * scan keys still aren't satisfied, start a new primitive index scan. + */ + if (!all_required_satisfied && pstate->finaltup == tuple) + goto new_prim_scan; + + /* + * Proactively check finaltup (don't wait until finaltup is reached by the + * scan) when it might well turn out to not be satisfied later on. + * + * Note: if so->scanBehind hasn't already been set for finaltup by us, + * it'll be set during this call to _bt_tuple_before_array_skeys. Either + * way, it'll be set correctly (for the whole page) after this point. + */ + if (!all_required_satisfied && pstate->finaltup && + _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc, + BTreeTupleGetNAtts(pstate->finaltup, rel), + false, 0, &so->scanBehind)) + goto new_prim_scan; + + /* + * When we encounter a truncated finaltup high key attribute, we're + * optimistic about the chances of its corresponding required scan key + * being satisfied when we go on to check it against tuples from this + * page's right sibling leaf page. We consider truncated attributes to be + * satisfied by required scan keys, which allows the primitive index scan + * to continue to the next leaf page. We must set so->scanBehind to true + * to remember that the last page's finaltup had "satisfied" required scan + * keys for one or more truncated attribute values (scan keys required in + * _either_ scan direction). + * + * There is a chance that _bt_checkkeys (which checks so->scanBehind) will + * find that even the sibling leaf page's finaltup is < the new array + * keys. When that happens, our optimistic policy will have incurred a + * single extra leaf page access that could have been avoided. + * + * A pessimistic policy would give backward scans a gratuitous advantage + * over forward scans. We'd punish forward scans for applying more + * accurate information from the high key, rather than just using the + * final non-pivot tuple as finaltup, in the style of backward scans. + * Being pessimistic would also give some scans with non-required arrays a + * perverse advantage over similar scans that use required arrays instead. + * + * You can think of this as a speculative bet on what the scan is likely + * to find on the next page. It's not much of a gamble, though, since the + * untruncated prefix of attributes must strictly satisfy the new qual + * (though it's okay if any non-required scan keys fail to be satisfied). + */ + if (so->scanBehind && has_required_opposite_direction_only) + { + /* + * However, we avoid this behavior whenever the scan involves a scan + * key required in the opposite direction to the scan only, along with + * a finaltup with at least one truncated attribute that's associated + * with a scan key marked required (required in either direction). + * + * _bt_check_compare simply won't stop the scan for a scan key that's + * marked required in the opposite scan direction only. 
That leaves + * us without any reliable way of reconsidering any opposite-direction + * inequalities if it turns out that starting a new primitive index + * scan will allow _bt_first to skip ahead by a great many leaf pages + * (see next section for details of how that works). + */ + goto new_prim_scan; + } + + /* + * Handle inequalities marked required in the opposite scan direction. + * They can also signal that we should start a new primitive index scan. + * + * It's possible that the scan is now positioned where "matching" tuples + * begin, and that caller's tuple satisfies all scan keys required in the + * current scan direction. But if caller's tuple still doesn't satisfy + * other scan keys that are required in the opposite scan direction only + * (e.g., a required >= strategy scan key when scan direction is forward), + * it's still possible that there are many leaf pages before the page that + * _bt_first could skip straight to. Groveling through all those pages + * will always give correct answers, but it can be very inefficient. We + * must avoid needlessly scanning extra pages. + * + * Separately, it's possible that _bt_check_compare set continuescan=false + * for a scan key that's required in the opposite direction only. This is + * a special case that happens only when _bt_check_compare sees that the + * inequality encountered a NULL value. This signals the end of non-NULL + * values in the current scan direction, which is reason enough to end the + * (primitive) scan. If this happens at the start of a large group of + * NULL values, then we shouldn't expect to be called again until after + * the scan has already read indefinitely-many leaf pages full of tuples + * with NULL suffix values. We need a separate test for this case so that + * we don't miss our only opportunity to skip over such a group of pages. + * (_bt_first is expected to skip over the group of NULLs by applying a + * similar "deduce NOT NULL" rule, where it finishes its insertion scan + * key by consing up an explicit SK_SEARCHNOTNULL key.) + * + * Apply a test against finaltup to detect and recover from these + * problems: if even finaltup doesn't satisfy such an inequality, we just + * skip ahead by starting a new primitive index scan. When we skip, we + * know for sure that all of the tuples on the current page following + * caller's tuple are also before the _bt_first-wise start of tuples for + * our new qual. That at least suggests many more skippable pages beyond + * the current page. + */ + if (has_required_opposite_direction_only && pstate->finaltup && + (all_required_satisfied || oppodir_inequality_sktrig)) + { + int nfinaltupatts = BTreeTupleGetNAtts(pstate->finaltup, rel); + ScanDirection flipped; + bool continuescanflip; + int opsktrig; + + /* + * We're checking finaltup (which is usually not caller's tuple), so + * cannot reuse work from caller's earlier _bt_check_compare call. + * + * Flip the scan direction when calling _bt_check_compare this time, + * so that it will set continuescanflip=false when it encounters an + * inequality required in the opposite scan direction.
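+ * + * (Hypothetical example: with an index on (a, b) and the qual "a IN (1, 7) AND b >= 5", suppose a forward scan just advanced its array from a=1 to a=7 upon reading the tuple (7, 1). If even finaltup -- say (7, 4) -- fails the flipped-direction test of "b >= 5", we'll start a new primitive index scan, allowing _bt_first to descend straight to tuples >= (7, 5) instead of groveling through pages full of "a=7, b < 5" tuples.)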
+ */ + Assert(!so->scanBehind); + opsktrig = 0; + flipped = -dir; + _bt_check_compare(scan, flipped, + pstate->finaltup, nfinaltupatts, tupdesc, + false, false, false, + &continuescanflip, &opsktrig); + + /* + * If we ended up here due to the all_required_satisfied criteria, + * test opsktrig in a way that ensures that finaltup contains the same + * prefix of key columns as caller's tuple (a prefix that satisfies + * earlier required-in-current-direction scan keys). + * + * If we ended up here due to the oppodir_inequality_sktrig criteria, + * test opsktrig in a way that ensures that the same scan key that our + * caller found to be unsatisfied (by the scan's tuple) was also the + * one unsatisfied just now (by finaltup). That way we'll only start + * a new primitive scan when we're sure that both tuples _don't_ share + * the same prefix of satisfied equality-constrained attribute values, + * and that finaltup has a non-NULL attribute value indicated by the + * unsatisfied scan key at offset opsktrig/sktrig. (This depends on + * _bt_check_compare not caring about the direction that inequalities + * are required in whenever NULL attribute values are unsatisfied. It + * only cares about the scan direction, and its relationship to + * whether NULLs are stored first or last relative to non-NULLs.) + */ + Assert(all_required_satisfied != oppodir_inequality_sktrig); + if (unlikely(!continuescanflip && + ((all_required_satisfied && opsktrig > sktrig) || + (oppodir_inequality_sktrig && opsktrig >= sktrig)))) + { + Assert(so->keyData[opsktrig].sk_strategy != BTEqualStrategyNumber); + + /* + * Make sure that any non-required arrays are set to the first + * array element for the current scan direction + */ + _bt_rewind_nonrequired_arrays(scan, dir); + + goto new_prim_scan; + } + } + + /* + * Stick with the ongoing primitive index scan for now. + * + * It's possible that later tuples will also turn out to have values that + * are still < the now-current array keys (or > the current array keys). + * Our caller will handle this by performing what amounts to a linear + * search of the page, implemented by calling _bt_check_compare and then + * _bt_tuple_before_array_skeys for each tuple. + * + * This approach has various advantages over a binary search of the page. + * Repeated binary searches of the page (one binary search for every array + * advancement) won't outperform a continuous linear search. While there + * are workloads that a naive linear search won't handle well, our caller + * has a "look ahead" fallback mechanism to deal with that problem. + */ + pstate->continuescan = true; /* Override _bt_check_compare */ + so->needPrimScan = false; /* _bt_readpage has more tuples to check */ + + if (so->scanBehind) + { + /* Optimization: skip by setting "look ahead" mechanism's offnum */ + Assert(ScanDirectionIsForward(dir)); + pstate->skip = pstate->maxoff + 1; + } + + /* Caller's tuple doesn't match the new qual */ + return false; + +new_prim_scan: + + /* + * End this primitive index scan, but schedule another. + * + * Note: If the scan direction happens to change, this scheduled primitive + * index scan won't go ahead after all. + */ + pstate->continuescan = false; /* Tell _bt_readpage we're done... 
*/ + so->needPrimScan = true; /* ...but call _bt_first again */ + + if (scan->parallel_scan) + _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page); + + /* Caller's tuple doesn't match the new qual */ + return false; + +end_toplevel_scan: + + /* + * End the current primitive index scan, but don't schedule another. + * + * This ends the entire top-level scan in the current scan direction. + * + * Note: The scan's arrays (including any non-required arrays) are now in + * their final positions for the current scan direction. If the scan + * direction happens to change, then the arrays will already be in their + * first positions for what will then be the current scan direction. + */ + pstate->continuescan = false; /* Tell _bt_readpage we're done... */ + so->needPrimScan = false; /* ...don't call _bt_first again, though */ + + /* Caller's tuple doesn't match any qual */ + return false; +} /* * _bt_preprocess_keys() -- Preprocess scan keys * - * The given search-type keys (in scan->keyData[] or so->arrayKeyData[]) + * The given search-type keys (taken from scan->keyData[]) * are copied to so->keyData[] with possible transformation. * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets * the number of output keys (possibly less, never greater). @@ -690,8 +2485,9 @@ _bt_restore_array_keys(IndexScanDesc scan) * The output keys must be sorted by index attribute. Presently we expect * (but verify) that the input keys are already so sorted --- this is done * by match_clauses_to_index() in indxpath.c. Some reordering of the keys - * within each attribute may be done as a byproduct of the processing here, - * but no other code depends on that. + * within each attribute may be done as a byproduct of the processing here. + * That process must leave array scan keys (within an attribute) in the same + * order as the corresponding entries in the scan's BTArrayKeyInfo array. * * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD * if they must be satisfied in order to continue the scan forward or backward @@ -748,8 +2544,8 @@ _bt_restore_array_keys(IndexScanDesc scan) * * Note: the reason we have to copy the preprocessed scan keys into private * storage is that we are modifying the array based on comparisons of the - * key argument values, which could change on a rescan or after moving to - * new elements of array keys. Therefore we can't overwrite the source data. + * key argument values, which could change on a rescan. Therefore we can't + * overwrite the source data. */ void _bt_preprocess_keys(IndexScanDesc scan) @@ -762,11 +2558,31 @@ _bt_preprocess_keys(IndexScanDesc scan) ScanKey inkeys; ScanKey outkeys; ScanKey cur; - ScanKey xform[BTMaxStrategyNumber]; + BTScanKeyPreproc xform[BTMaxStrategyNumber]; bool test_result; int i, j; AttrNumber attno; + ScanKey arrayKeyData; + int *keyDataMap = NULL; + int arrayidx = 0; + + /* + * We're called at the start of each primitive index scan during scans + * that use equality array keys. We can just reuse the scan keys that + * were output at the start of the scan's first primitive index scan. + */ + if (so->numberOfKeys > 0) + { + /* + * An earlier call to _bt_advance_array_keys already set everything + * up. Just assert that the scan's existing output scan keys are + * consistent with its current array elements.
+ */ + Assert(so->numArrayKeys); + Assert(_bt_verify_keys_with_arraykeys(scan)); + return; + } /* initialize result variables */ so->qual_ok = true; @@ -775,11 +2591,27 @@ _bt_preprocess_keys(IndexScanDesc scan) if (numberOfKeys < 1) return; /* done if qual-less scan */ + /* If any keys are SK_SEARCHARRAY type, set up array-key info */ + arrayKeyData = _bt_preprocess_array_keys(scan); + if (!so->qual_ok) + { + /* unmatchable array, so give up */ + return; + } + /* - * Read so->arrayKeyData if array keys are present, else scan->keyData + * Treat arrayKeyData[] (a partially preprocessed copy of scan->keyData[]) + * as our input if _bt_preprocess_array_keys just allocated it, else just + * use scan->keyData[] */ - if (so->arrayKeyData != NULL) - inkeys = so->arrayKeyData; + if (arrayKeyData) + { + inkeys = arrayKeyData; + + /* Also maintain keyDataMap for remapping so->orderProc[] later */ + keyDataMap = MemoryContextAlloc(so->arrayContext, + numberOfKeys * sizeof(int)); + } else inkeys = scan->keyData; @@ -800,6 +2632,19 @@ _bt_preprocess_keys(IndexScanDesc scan) /* We can mark the qual as required if it's for first index col */ if (cur->sk_attno == 1) _bt_mark_scankey_required(outkeys); + if (arrayKeyData) + { + /* + * Don't call _bt_preprocess_array_keys_final in this fast path + * (we'll miss out on the single value array transformation, but + * that's not nearly as important when there's only one scan key) + */ + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_strategy != BTEqualStrategyNumber || + (so->arrayKeys[0].scan_key == 0 && + OidIsValid(so->orderProcs[0].fn_oid))); + } + return; } @@ -859,13 +2704,29 @@ _bt_preprocess_keys(IndexScanDesc scan) * check, and we've rejected any combination of it with a regular * equality condition; but not with other types of conditions. 
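+ * + * (E.g., a hypothetical "x < 10 AND x = ANY('{5, 15}')" qual allows _bt_compare_scankey_args to shrink the array to just {5}, after which the "x < 10" key is discarded as redundant.)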
*/ - if (xform[BTEqualStrategyNumber - 1]) + if (xform[BTEqualStrategyNumber - 1].skey) { - ScanKey eq = xform[BTEqualStrategyNumber - 1]; + ScanKey eq = xform[BTEqualStrategyNumber - 1].skey; + BTArrayKeyInfo *array = NULL; + FmgrInfo *orderproc = NULL; + + if (arrayKeyData && (eq->sk_flags & SK_SEARCHARRAY)) + { + int eq_in_ikey, + eq_arrayidx; + + eq_in_ikey = xform[BTEqualStrategyNumber - 1].ikey; + eq_arrayidx = xform[BTEqualStrategyNumber - 1].arrayidx; + array = &so->arrayKeys[eq_arrayidx - 1]; + orderproc = so->orderProcs + eq_in_ikey; + + Assert(array->scan_key == eq_in_ikey); + Assert(OidIsValid(orderproc->fn_oid)); + } for (j = BTMaxStrategyNumber; --j >= 0;) { - ScanKey chk = xform[j]; + ScanKey chk = xform[j].skey; if (!chk || j == (BTEqualStrategyNumber - 1)) continue; @@ -878,6 +2739,7 @@ _bt_preprocess_keys(IndexScanDesc scan) } if (_bt_compare_scankey_args(scan, chk, eq, chk, + array, orderproc, &test_result)) { if (!test_result) @@ -887,7 +2749,9 @@ _bt_preprocess_keys(IndexScanDesc scan) return; } /* else discard the redundant non-equality key */ - xform[j] = NULL; + Assert(!array || array->num_elems > 0); + xform[j].skey = NULL; + xform[j].ikey = -1; } /* else, cannot determine redundancy, keep both keys */ } @@ -896,36 +2760,36 @@ _bt_preprocess_keys(IndexScanDesc scan) } /* try to keep only one of <, <= */ - if (xform[BTLessStrategyNumber - 1] - && xform[BTLessEqualStrategyNumber - 1]) + if (xform[BTLessStrategyNumber - 1].skey + && xform[BTLessEqualStrategyNumber - 1].skey) { - ScanKey lt = xform[BTLessStrategyNumber - 1]; - ScanKey le = xform[BTLessEqualStrategyNumber - 1]; + ScanKey lt = xform[BTLessStrategyNumber - 1].skey; + ScanKey le = xform[BTLessEqualStrategyNumber - 1].skey; - if (_bt_compare_scankey_args(scan, le, lt, le, + if (_bt_compare_scankey_args(scan, le, lt, le, NULL, NULL, &test_result)) { if (test_result) - xform[BTLessEqualStrategyNumber - 1] = NULL; + xform[BTLessEqualStrategyNumber - 1].skey = NULL; else - xform[BTLessStrategyNumber - 1] = NULL; + xform[BTLessStrategyNumber - 1].skey = NULL; } } /* try to keep only one of >, >= */ - if (xform[BTGreaterStrategyNumber - 1] - && xform[BTGreaterEqualStrategyNumber - 1]) + if (xform[BTGreaterStrategyNumber - 1].skey + && xform[BTGreaterEqualStrategyNumber - 1].skey) { - ScanKey gt = xform[BTGreaterStrategyNumber - 1]; - ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1]; + ScanKey gt = xform[BTGreaterStrategyNumber - 1].skey; + ScanKey ge = xform[BTGreaterEqualStrategyNumber - 1].skey; - if (_bt_compare_scankey_args(scan, ge, gt, ge, + if (_bt_compare_scankey_args(scan, ge, gt, ge, NULL, NULL, &test_result)) { if (test_result) - xform[BTGreaterEqualStrategyNumber - 1] = NULL; + xform[BTGreaterEqualStrategyNumber - 1].skey = NULL; else - xform[BTGreaterStrategyNumber - 1] = NULL; + xform[BTGreaterStrategyNumber - 1].skey = NULL; } } @@ -936,11 +2800,13 @@ _bt_preprocess_keys(IndexScanDesc scan) */ for (j = BTMaxStrategyNumber; --j >= 0;) { - if (xform[j]) + if (xform[j].skey) { ScanKey outkey = &outkeys[new_numberOfKeys++]; - memcpy(outkey, xform[j], sizeof(ScanKeyData)); + memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); + if (arrayKeyData) + keyDataMap[new_numberOfKeys - 1] = xform[j].ikey; if (priorNumberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); } @@ -966,6 +2832,8 @@ _bt_preprocess_keys(IndexScanDesc scan) ScanKey outkey = &outkeys[new_numberOfKeys++]; memcpy(outkey, cur, sizeof(ScanKeyData)); + if (arrayKeyData) + keyDataMap[new_numberOfKeys - 1] = i; if 
(numberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); @@ -977,20 +2845,112 @@ _bt_preprocess_keys(IndexScanDesc scan) continue; } - /* have we seen one of these before? */ - if (xform[j] == NULL) + /* + * Does this input scan key require further processing as an array? + */ + if (cur->sk_strategy == InvalidStrategy) { - /* nope, so remember this scankey */ - xform[j] = cur; + /* _bt_preprocess_array_keys marked this array key redundant */ + Assert(arrayKeyData); + Assert(cur->sk_flags & SK_SEARCHARRAY); + continue; + } + + if (cur->sk_strategy == BTEqualStrategyNumber && + (cur->sk_flags & SK_SEARCHARRAY)) + { + /* _bt_preprocess_array_keys kept this array key */ + Assert(arrayKeyData); + arrayidx++; + } + + /* + * have we seen a scan key for this same attribute and using this same + * operator strategy before now? + */ + if (xform[j].skey == NULL) + { + /* nope, so this scan key wins by default (at least for now) */ + xform[j].skey = cur; + xform[j].ikey = i; + xform[j].arrayidx = arrayidx; } else { - /* yup, keep only the more restrictive key */ - if (_bt_compare_scankey_args(scan, cur, cur, xform[j], - &test_result)) + FmgrInfo *orderproc = NULL; + BTArrayKeyInfo *array = NULL; + + /* + * Seen one of these before, so keep only the more restrictive key + * if possible + */ + if (j == (BTEqualStrategyNumber - 1) && arrayKeyData) { + /* + * Have to set up array keys + */ + if ((cur->sk_flags & SK_SEARCHARRAY)) + { + array = &so->arrayKeys[arrayidx - 1]; + orderproc = so->orderProcs + i; + + Assert(array->scan_key == i); + Assert(OidIsValid(orderproc->fn_oid)); + } + else if ((xform[j].skey->sk_flags & SK_SEARCHARRAY)) + { + array = &so->arrayKeys[xform[j].arrayidx - 1]; + orderproc = so->orderProcs + xform[j].ikey; + + Assert(array->scan_key == xform[j].ikey); + Assert(OidIsValid(orderproc->fn_oid)); + } + + /* + * Both scan keys might have arrays, in which case we'll + * arbitrarily pass only one of the arrays. That won't + * matter, since _bt_compare_scankey_args is aware that two + * SEARCHARRAY scan keys mean that _bt_preprocess_array_keys + * failed to eliminate redundant arrays through array merging. + * _bt_compare_scankey_args just returns false when it sees + * this; it won't even try to examine either array. + */ + } + + if (_bt_compare_scankey_args(scan, cur, cur, xform[j].skey, + array, orderproc, &test_result)) + { + /* Have all we need to determine redundancy */ if (test_result) - xform[j] = cur; + { + Assert(!array || array->num_elems > 0); + + /* + * New key is more restrictive, and so replaces old key... + */ + if (j != (BTEqualStrategyNumber - 1) || + !(xform[j].skey->sk_flags & SK_SEARCHARRAY)) + { + Assert(!array || array->scan_key == i); + xform[j].skey = cur; + xform[j].ikey = i; + xform[j].arrayidx = arrayidx; + } + else + { + /* + * ...unless we have to keep the old key because it's + * an array that rendered the new key redundant. We + * need to make sure that we don't throw away an array + * scan key. _bt_compare_scankey_args expects us to + * always keep arrays (and discard non-arrays). + */ + Assert(j == (BTEqualStrategyNumber - 1)); + Assert(xform[j].skey->sk_flags & SK_SEARCHARRAY); + Assert(xform[j].ikey == array->scan_key); + Assert(!(cur->sk_flags & SK_SEARCHARRAY)); + } + } else if (j == (BTEqualStrategyNumber - 1)) { /* key == a && key == b, but a != b */ @@ -1002,22 +2962,130 @@ _bt_preprocess_keys(IndexScanDesc scan) else { /* - * We can't determine which key is more restrictive. 
Keep the - * previous one in xform[j] and push this one directly to the - * output array. + * We can't determine which key is more restrictive. Push + * xform[j] directly to the output array, then set xform[j] to + * the new scan key. + * + * Note: We do things this way around so that our arrays are + * always in the same order as their corresponding scan keys, + * even with incomplete opfamilies. _bt_advance_array_keys + * depends on this. */ ScanKey outkey = &outkeys[new_numberOfKeys++]; - memcpy(outkey, cur, sizeof(ScanKeyData)); + memcpy(outkey, xform[j].skey, sizeof(ScanKeyData)); + if (arrayKeyData) + keyDataMap[new_numberOfKeys - 1] = xform[j].ikey; if (numberOfEqualCols == attno - 1) _bt_mark_scankey_required(outkey); + xform[j].skey = cur; + xform[j].ikey = i; + xform[j].arrayidx = arrayidx; } } } so->numberOfKeys = new_numberOfKeys; + + /* + * Now that we've built a temporary mapping from so->keyData[] (output + * scan keys) to scan->keyData[] (input scan keys), fix array->scan_key + * references. Also consolidate the so->orderProc[] array such that it + * can be subscripted using so->keyData[]-wise offsets. + */ + if (arrayKeyData) + _bt_preprocess_array_keys_final(scan, keyDataMap); + + /* Could pfree arrayKeyData/keyDataMap now, but not worth the cycles */ } +#ifdef USE_ASSERT_CHECKING +/* + * Verify that the scan's qual state matches what we expect at the point that + * _bt_start_prim_scan is about to start a just-scheduled new primitive scan. + * + * We enforce a rule against non-required array scan keys: they must start out + * with whatever element is the first for the scan's current scan direction. + * See _bt_rewind_nonrequired_arrays comments for an explanation. + */ +static bool +_bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int arrayidx = 0; + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array = NULL; + int first_elem_dir; + + if (!(cur->sk_flags & SK_SEARCHARRAY) || + cur->sk_strategy != BTEqualStrategyNumber) + continue; + + array = &so->arrayKeys[arrayidx++]; + + if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || + ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) + continue; + + if (ScanDirectionIsForward(dir)) + first_elem_dir = 0; + else + first_elem_dir = array->num_elems - 1; + + if (array->cur_elem != first_elem_dir) + return false; + } + + return _bt_verify_keys_with_arraykeys(scan); +} + +/* + * Verify that the scan's "so->keyData[]" scan keys are in agreement with + * its array key state + */ +static bool +_bt_verify_keys_with_arraykeys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + int last_sk_attno = InvalidAttrNumber, + arrayidx = 0; + + if (!so->qual_ok) + return false; + + for (int ikey = 0; ikey < so->numberOfKeys; ikey++) + { + ScanKey cur = so->keyData + ikey; + BTArrayKeyInfo *array; + + if (cur->sk_strategy != BTEqualStrategyNumber || + !(cur->sk_flags & SK_SEARCHARRAY)) + continue; + + array = &so->arrayKeys[arrayidx++]; + if (array->scan_key != ikey) + return false; + + if (array->num_elems <= 0) + return false; + + if (cur->sk_argument != array->elem_values[array->cur_elem]) + return false; + if (last_sk_attno > cur->sk_attno) + return false; + last_sk_attno = cur->sk_attno; + } + + if (arrayidx != so->numArrayKeys) + return false; + + return true; +} +#endif + /* * Compare two scankey values using a specified operator. 
* @@ -1033,9 +3101,24 @@ _bt_preprocess_keys(IndexScanDesc scan) * we store the operator result in *result and return true. We return false * if the comparison could not be made. * + * If either leftarg or rightarg is an array, we'll apply array-specific + * rules to determine which array elements are redundant on behalf of caller. + * It is up to our caller to save whichever of the two scan keys is the array, + * and discard the non-array scan key (the non-array scan key is guaranteed to + * be redundant with any complete opfamily). Caller isn't expected to call + * here with a pair of array scan keys provided we're dealing with a complete + * opfamily (_bt_preprocess_array_keys will merge array keys together to make + * sure of that). + * + * Note: we'll also shrink caller's array as needed to eliminate redundant + * array elements. One reason why caller should prefer to discard non-array + * scan keys is so that we'll have the opportunity to shrink the array + * multiple times, in multiple calls (for each of several other scan keys on + * the same index attribute). + * * Note: op always points at the same ScanKey as either leftarg or rightarg. - * Since we don't scribble on the scankeys, this aliasing should cause no - * trouble. + * Since we don't scribble on the scankeys themselves, this aliasing should + * cause no trouble. * * Note: this routine needs to be insensitive to any DESC option applied * to the index column. For example, "x < 4" is a tighter constraint than @@ -1044,6 +3127,7 @@ _bt_preprocess_keys(IndexScanDesc scan) static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, + BTArrayKeyInfo *array, FmgrInfo *orderproc, bool *result) { Relation rel = scan->indexRelation; @@ -1112,6 +3196,48 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, return true; } + /* + * If either leftarg or rightarg is an equality-type array scankey, we + * need specialized handling (since by now we know that IS NULL wasn't + * used) + */ + if (array) + { + bool leftarray, + rightarray; + + leftarray = ((leftarg->sk_flags & SK_SEARCHARRAY) && + leftarg->sk_strategy == BTEqualStrategyNumber); + rightarray = ((rightarg->sk_flags & SK_SEARCHARRAY) && + rightarg->sk_strategy == BTEqualStrategyNumber); + + /* + * _bt_preprocess_array_keys is responsible for merging together array + * scan keys, and will do so whenever the opfamily has the required + * cross-type support. If it failed to do that, we handle it just + * like the case where we can't make the comparison ourselves. + */ + if (leftarray && rightarray) + { + /* Can't make the comparison */ + *result = false; /* suppress compiler warnings */ + return false; + } + + /* + * Otherwise we need to determine which one of leftarg or rightarg + * uses an array, then pass this through to a dedicated helper + * function. + */ + if (leftarray) + return _bt_compare_array_scankey_args(scan, leftarg, rightarg, + orderproc, array, result); + else if (rightarray) + return _bt_compare_array_scankey_args(scan, rightarg, leftarg, + orderproc, array, result); + + /* FALL THRU */ + } + /* * The opfamily we need to worry about is identified by the index column. */ @@ -1351,60 +3477,234 @@ _bt_mark_scankey_required(ScanKey skey) * * Return true if so, false if not. If the tuple fails to pass the qual, * we also determine whether there's any need to continue the scan beyond - * this tuple, and set *continuescan accordingly. See comments for + * this tuple, and set pstate.continuescan accordingly.
See comments for * _bt_preprocess_keys(), above, about how this is done. * * Forward scan callers can pass a high key tuple in the hopes of having * us set *continuescan to false, and avoiding an unnecessary visit to * the page to the right. * + * Advances the scan's array keys when necessary for arrayKeys=true callers. + * Caller can avoid all array-related side effects when calling just to do a + * page continuescan precheck -- pass arrayKeys=false for that. Scans without + * any array keys must always pass arrayKeys=false. + * + * Also stops and starts primitive index scans for arrayKeys=true callers. + * Scans with array keys are required to set up page state that helps us with + * this. The page's finaltup tuple (the page high key for a forward scan, or + * the page's first non-pivot tuple for a backward scan) must be set in + * pstate.finaltup ahead of the first call here for the page (or possibly the + * first call after an initial continuescan-setting page precheck call). Set + * this to NULL for the rightmost page (or the leftmost page for backwards + * scans). + * * scan: index scan descriptor (containing a search-type scankey) + * pstate: page level input and output parameters + * arrayKeys: should we advance the scan's array keys if necessary? * tuple: index tuple to test * tupnatts: number of attributes in tupnatts (high key may be truncated) - * dir: direction we are scanning in - * continuescan: output parameter (will be set correctly in all cases) - * continuescanPrechecked: indicates that *continuescan flag is known to - * be true for the last item on the page - * haveFirstMatch: indicates that we already have at least one match - * in the current page */ bool -_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, - ScanDirection dir, bool *continuescan, - bool continuescanPrechecked, bool haveFirstMatch) +_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, + IndexTuple tuple, int tupnatts) { - TupleDesc tupdesc; - BTScanOpaque so; - int keysz; - int ikey; - ScanKey key; + TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ScanDirection dir = pstate->dir; + int ikey = 0; + bool res; Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); - *continuescan = true; /* default assumption */ + res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, + arrayKeys, pstate->prechecked, pstate->firstmatch, + &pstate->continuescan, &ikey); - tupdesc = RelationGetDescr(scan->indexRelation); - so = (BTScanOpaque) scan->opaque; - keysz = so->numberOfKeys; - - for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) +#ifdef USE_ASSERT_CHECKING + if (!arrayKeys && so->numArrayKeys) { - Datum datum; - bool isNull; - Datum test; - bool requiredSameDir = false, - requiredOppositeDir = false; + /* + * This is a continuescan precheck call for a scan with array keys. + * + * Assert that the scan isn't in danger of becoming confused. + */ + Assert(!so->scanBehind && !pstate->prechecked && !pstate->firstmatch); + Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, + tupnatts, false, 0, NULL)); + } + if (pstate->prechecked || pstate->firstmatch) + { + bool dcontinuescan; + int dikey = 0; /* - * Check if the key is required for ordered scan in the same or - * opposite direction. Save as flag variables for future usage.
+ * Call relied on continuescan/firstmatch prechecks -- assert that we + * get the same answer without those optimizations + */ + Assert(res == _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, + false, false, false, + &dcontinuescan, &dikey)); + Assert(pstate->continuescan == dcontinuescan); + } +#endif + + /* + * Only one _bt_check_compare call is required in the common case where + * there are no equality strategy array scan keys. Otherwise we can only + * accept _bt_check_compare's answer unreservedly when it didn't set + * pstate.continuescan=false. + */ + if (!arrayKeys || pstate->continuescan) + return res; + + /* + * _bt_check_compare call set continuescan=false in the presence of + * equality type array keys. This could mean that the tuple is just past + * the end of matches for the current array keys. + * + * It's also possible that the scan is still _before_ the _start_ of + * tuples matching the current set of array keys. Check for that first. + */ + if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, + ikey, NULL)) + { + /* + * Tuple is still before the start of matches according to the scan's + * required array keys (according to _all_ of its required equality + * strategy keys, actually). + * + * _bt_advance_array_keys occasionally sets so->scanBehind to signal + * that the scan's current position/tuples might be significantly + * behind (multiple pages behind) its current array keys. When this + * happens, we need to be prepared to recover by starting a new + * primitive index scan here, on our own. + */ + Assert(!so->scanBehind || + so->keyData[ikey].sk_strategy == BTEqualStrategyNumber); + if (unlikely(so->scanBehind) && pstate->finaltup && + _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc, + BTreeTupleGetNAtts(pstate->finaltup, + scan->indexRelation), + false, 0, NULL)) + { + /* Cut our losses -- start a new primitive index scan now */ + pstate->continuescan = false; + so->needPrimScan = true; + } + else + { + /* Override _bt_check_compare, continue primitive scan */ + pstate->continuescan = true; + + /* + * We will end up here repeatedly given a group of tuples > the + * previous array keys and < the now-current keys (for a backwards + * scan it's just the same, though the operators swap positions). + * + * We must avoid allowing this linear search process to scan very + * many tuples from well before the start of tuples matching the + * current array keys (or from well before the point where we'll + * once again have to advance the scan's array keys). + * + * We keep the overhead under control by speculatively "looking + * ahead" to later still-unscanned items from this same leaf page. + * We'll only attempt this once the number of tuples that the + * linear search process has examined starts to get out of hand. + */ + pstate->rechecks++; + if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS) + { + /* See if we should skip ahead within the current leaf page */ + _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc); + + /* + * Might have set pstate.skip to a later page offset. When + * that happens then _bt_readpage caller will inexpensively + * skip ahead to a later tuple from the same page (the one + * just after the tuple we successfully "looked ahead" to). + */ + } + } + + /* This indextuple doesn't match the current qual, in any case */ + return false; + } + + /* + * Caller's tuple is >= the current set of array keys and other equality + * constraint scan keys (or <= if this is a backwards scan). 
It's now + * clear that we _must_ advance any required array keys in lockstep with + * the scan. + */ + return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc, + ikey, true); +} + +/* + * Test whether an indextuple satisfies the current scan condition. + * + * Return true if so, false if not. If not, we also set *continuescan to + * false when it's not possible for any later tuples to pass the current qual + * (with the scan's current set of array keys, in the current scan direction), + * in addition to setting *ikey to the so->keyData[] subscript/offset for the + * unsatisfied scan key (needed when caller must consider advancing the scan's + * array keys). + * + * This is a subroutine for _bt_checkkeys. We provisionally assume that + * reaching the end of the current set of required keys (in particular the + * current required array keys) ends the ongoing (primitive) index scan. + * Callers without array keys should just end the scan right away when they + * find that continuescan has been set to false here by us. Things are more + * complicated for callers with array keys. + * + * Callers with array keys must first consider advancing the arrays when + * continuescan has been set to false here by us. They must then consider + * whether it really does make sense to end the current (primitive) index + * scan, in light of everything that is known at that point. (In general + * when we set continuescan=false for these callers it must be treated as + * provisional.) + * + * We deal with advancing unsatisfied non-required arrays directly, though. + * This is safe, since by definition non-required keys can't end the scan. + * This is simply how we determine whether non-required arrays are merely + * unsatisfied by the current array key, or truly unsatisfied (that is, + * unsatisfied by every possible array key). + * + * Though we advance non-required array keys on our own, that shouldn't have + * any lasting consequences for the scan. By definition, non-required arrays + * have no fixed relationship with the scan's progress. (There are delicate + * considerations for non-required arrays when the arrays need to be advanced + * following our setting continuescan to false, but that doesn't concern us.) + * + * Pass advancenonrequired=false to avoid all array-related side effects. + * This allows our _bt_advance_array_keys caller to avoid infinite recursion.
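+ * + * (Hypothetical example: with an index on (a, b) and the qual "a < 10 AND b IN (1, 3)", the "b IN" array is non-required. When the tuple (5, 3) fails the current b = 1 array key, we advance the b array to 3 on our own and recheck, and the tuple passes -- the b array can never end the scan, though; only the required "a < 10" key can do that.)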
+ */ +static bool +_bt_check_compare(IndexScanDesc scan, ScanDirection dir, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool advancenonrequired, bool prechecked, bool firstmatch, + bool *continuescan, int *ikey) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + *continuescan = true; /* default assumption */ + + for (; *ikey < so->numberOfKeys; (*ikey)++) + { + ScanKey key = so->keyData + *ikey; + Datum datum; + bool isNull; + bool requiredSameDir = false, + requiredOppositeDirOnly = false; + + /* + * Check if the key is required in the current scan direction, in the + * opposite scan direction _only_, or in neither direction */ if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) requiredSameDir = true; else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) - requiredOppositeDir = true; + requiredOppositeDirOnly = true; /* * If the caller told us the *continuescan flag is known to be true @@ -1422,8 +3722,9 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * Both cases above work except for the row keys, where NULLs could be * found in the middle of matching values. */ - if ((requiredSameDir || (requiredOppositeDir && haveFirstMatch)) && - !(key->sk_flags & SK_ROW_HEADER) && continuescanPrechecked) + if (prechecked && + (requiredSameDir || (requiredOppositeDirOnly && firstmatch)) && + !(key->sk_flags & SK_ROW_HEADER)) continue; if (key->sk_attno > tupnatts) @@ -1434,7 +3735,6 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * right could be any possible value. Assume that truncated * attribute passes the qual. */ - Assert(ScanDirectionIsForward(dir)); Assert(BTreeTupleIsPivot(tuple)); continue; } @@ -1495,6 +3795,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a forward scan, however, we must keep going, because we may * have initially positioned to the start of the index. + * (_bt_advance_array_keys also relies on this behavior during + * forward scans.) */ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsBackward(dir)) @@ -1511,6 +3813,8 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a backward scan, however, we must keep going, because we * may have initially positioned to the end of the index. + * (_bt_advance_array_keys also relies on this behavior during + * backward scans.) */ if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsForward(dir)) @@ -1524,24 +3828,15 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, } /* - * Apply the key-checking function. When the key is required for the - * opposite direction scan, it must be already satisfied as soon as - * there is already match on the page. Except for the NULLs checking, - * which have already done above. + * Apply the key-checking function, though only if we must. + * + * When a key is required in the opposite-of-scan direction _only_, + * then it must already be satisfied if firstmatch=true indicates that + * an earlier tuple from this same page satisfied it earlier on. 
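+ * + * (E.g., given a forward scan with a required-in-opposite-direction-only key such as "a >= 5": once firstmatch=true tells us that some earlier tuple on this page satisfied the qual, every later tuple on the page must satisfy "a >= 5" too, since a forward scan only moves through equal-or-greater key values.)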
*/ - if (!(requiredOppositeDir && haveFirstMatch)) - { - test = FunctionCall2Coll(&key->sk_func, key->sk_collation, - datum, key->sk_argument); - } - else - { - test = true; - Assert(test == FunctionCall2Coll(&key->sk_func, key->sk_collation, - datum, key->sk_argument)); - } - - if (!DatumGetBool(test)) + if (!(requiredOppositeDirOnly && firstmatch) && + !DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument))) { /* * Tuple fails this qual. If it's a required qual for the current @@ -1557,7 +3852,19 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, *continuescan = false; /* - * In any case, this indextuple doesn't match the qual. + * If this is a non-required equality-type array key, the tuple + * needs to be checked against every possible array key. Handle + * this by "advancing" the scan key's array to a matching value + * (if we're successful then the tuple might match the qual). + */ + else if (advancenonrequired && + key->sk_strategy == BTEqualStrategyNumber && + (key->sk_flags & SK_SEARCHARRAY)) + return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, + tupdesc, *ikey, false); + + /* + * This indextuple doesn't match the qual. */ return false; } @@ -1574,7 +3881,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * it's not possible for any future tuples in the current scan direction * to pass the qual. * - * This is a subroutine for _bt_checkkeys, which see for more info. + * This is a subroutine for _bt_checkkeys/_bt_check_compare. */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, @@ -1603,7 +3910,6 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * right could be any possible value. Assume that truncated * attribute passes the qual. */ - Assert(ScanDirectionIsForward(dir)); Assert(BTreeTupleIsPivot(tuple)); cmpresult = 0; if (subkey->sk_flags & SK_ROW_END) @@ -1630,6 +3936,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a forward scan, however, we must keep going, because we may * have initially positioned to the start of the index. + * (_bt_advance_array_keys also relies on this behavior during + * forward scans.) */ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsBackward(dir)) @@ -1646,6 +3954,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, * because it's not possible for any future tuples to pass. On * a backward scan, however, we must keep going, because we * may have initially positioned to the end of the index. + * (_bt_advance_array_keys also relies on this behavior during + * backward scans.) */ if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsForward(dir)) @@ -1741,6 +4051,90 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, return result; } +/* + * Determine if a scan with array keys should skip over uninteresting tuples. + * + * This is a subroutine for _bt_checkkeys. Called when _bt_readpage's linear + * search process (started after it finishes reading an initial group of + * matching tuples, used to locate the start of the next group of tuples + * matching the next set of required array keys) has already scanned an + * excessive number of tuples whose key space is "between arrays". 
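+ * + * (Hypothetical example: with "a IN (1, 1000)", tuples whose "a" values fall between 2 and 999 occupy key space that is "between arrays"; the linear search process would otherwise examine each such tuple in turn, which is exactly what looking ahead is designed to short-circuit.)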
+ * + * When we perform look ahead successfully, we'll set pstate.skip, which + * instructs _bt_readpage to skip ahead to that tuple next (which could be + * past the end of the scan's leaf page). Pages where the optimization is + * effective will generally still need to skip several times. Each call here + * performs only a single "look ahead" comparison of a later tuple, whose + * distance from the current tuple's offset number is determined by applying + * heuristics. + */ +static void +_bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, + int tupnatts, TupleDesc tupdesc) +{ + ScanDirection dir = pstate->dir; + OffsetNumber aheadoffnum; + IndexTuple ahead; + + /* Avoid looking ahead when comparing the page high key */ + if (pstate->offnum < pstate->minoff) + return; + + /* + * Don't look ahead when there aren't enough tuples remaining on the page + * (in the current scan direction) for it to be worth our while + */ + if (ScanDirectionIsForward(dir) && + pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE) + return; + else if (ScanDirectionIsBackward(dir) && + pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE) + return; + + /* + * The look ahead distance starts small, and ramps up as each call here + * allows _bt_readpage to skip over more tuples + */ + if (!pstate->targetdistance) + pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE; + else + pstate->targetdistance *= 2; + + /* Don't read past the end (or before the start) of the page, though */ + if (ScanDirectionIsForward(dir)) + aheadoffnum = Min((int) pstate->maxoff, + (int) pstate->offnum + pstate->targetdistance); + else + aheadoffnum = Max((int) pstate->minoff, + (int) pstate->offnum - pstate->targetdistance); + + ahead = (IndexTuple) PageGetItem(pstate->page, + PageGetItemId(pstate->page, aheadoffnum)); + if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts, + false, 0, NULL)) + { + /* + * Success -- instruct _bt_readpage to skip ahead to the very next + * tuple after the one we determined was still before the current + * array keys + */ + if (ScanDirectionIsForward(dir)) + pstate->skip = aheadoffnum + 1; + else + pstate->skip = aheadoffnum - 1; + } + else + { + /* + * Failure -- "ahead" tuple is too far ahead (we were too aggressive). + * + * Reset the number of rechecks, and aggressively reduce the target + * distance (we're much more aggressive here than we were when the + * distance was initially ramped up).
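+ * + * (E.g., a targetdistance that ramped up through repeated doublings to 32 drops to 4 after a single failure here, then to 1 after a second failure; ramping back up to 32 again would take five more successful look aheads.)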
+ */ + pstate->rechecks = 0; + pstate->targetdistance = Max(pstate->targetdistance / 8, 1); + } +} + /* * _bt_killitems - set LP_DEAD state for items an indexscan caller has * told us were killed diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 9e35aaf56e..fcf6d1d932 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -628,6 +628,8 @@ ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, EState *estate = node->ss.ps.state; node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, estate->es_snapshot); shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen); shm_toc_estimate_keys(&pcxt->estimator, 1); diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 2a3264599d..8000feff4c 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -1644,6 +1644,8 @@ ExecIndexScanEstimate(IndexScanState *node, EState *estate = node->ss.ps.state; node->iss_PscanLen = index_parallelscan_estimate(node->iss_RelationDesc, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, estate->es_snapshot); shm_toc_estimate_chunk(&pcxt->estimator, node->iss_PscanLen); shm_toc_estimate_keys(&pcxt->estimator, 1); diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 32c6a8bbdc..2230b13104 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -106,8 +106,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop); + bool *skip_nonnative_saop); static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *other_clauses); static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, @@ -706,8 +705,6 @@ eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids, * index AM supports them natively, we should just include them in simple * index paths. If not, we should exclude them while building simple index * paths, and then make a separate attempt to include them in bitmap paths. - * Furthermore, we should consider excluding lower-order ScalarArrayOpExpr - * quals so as to create ordered paths. */ static void get_index_paths(PlannerInfo *root, RelOptInfo *rel, @@ -716,37 +713,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { List *indexpaths; bool skip_nonnative_saop = false; - bool skip_lower_saop = false; ListCell *lc; /* * Build simple index paths using the clauses. Allow ScalarArrayOpExpr - * clauses only if the index AM supports them natively, and skip any such - * clauses for index columns after the first (so that we produce ordered - * paths if possible). + * clauses only if the index AM supports them natively. */ indexpaths = build_index_paths(root, rel, index, clauses, index->predOK, ST_ANYSCAN, - &skip_nonnative_saop, - &skip_lower_saop); - - /* - * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM - * that supports them, then try again including those clauses. This will - * produce paths with more selectivity but no ordering. 
- */ - if (skip_lower_saop) - { - indexpaths = list_concat(indexpaths, - build_index_paths(root, rel, - index, clauses, - index->predOK, - ST_ANYSCAN, - &skip_nonnative_saop, - NULL)); - } + &skip_nonnative_saop); /* * Submit all the ones that can form plain IndexScan plans to add_path. (A @@ -784,7 +761,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, false, ST_BITMAPSCAN, - NULL, NULL); *bitindexpaths = list_concat(*bitindexpaths, indexpaths); } @@ -817,27 +793,19 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, * to true if we found any such clauses (caller must initialize the variable * to false). If it's NULL, we do not ignore ScalarArrayOpExpr clauses. * - * If skip_lower_saop is non-NULL, we ignore ScalarArrayOpExpr clauses for - * non-first index columns, and we set *skip_lower_saop to true if we found - * any such clauses (caller must initialize the variable to false). If it's - * NULL, we do not ignore non-first ScalarArrayOpExpr clauses, but they will - * result in considering the scan's output to be unordered. - * * 'rel' is the index's heap relation * 'index' is the index for which we want to generate paths * 'clauses' is the collection of indexable clauses (IndexClause nodes) * 'useful_predicate' indicates whether the index has a useful predicate * 'scantype' indicates whether we need plain or bitmap scan support * 'skip_nonnative_saop' indicates whether to accept SAOP if index AM doesn't - * 'skip_lower_saop' indicates whether to accept non-first-column SAOP */ static List * build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop) + bool *skip_nonnative_saop) { List *result = NIL; IndexPath *ipath; @@ -848,12 +816,13 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, List *orderbyclausecols; List *index_pathkeys; List *useful_pathkeys; - bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; int indexcol; + Assert(skip_nonnative_saop != NULL || scantype == ST_BITMAPSCAN); + /* * Check that index supports the desired scan type(s) */ @@ -880,19 +849,11 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * on by btree and possibly other places.) The list can be empty, if the * index AM allows that. * - * found_lower_saop_clause is set true if we accept a ScalarArrayOpExpr - * index clause for a non-first index column. This prevents us from - * assuming that the scan result is ordered. (Actually, the result is - * still ordered if there are equality constraints for all earlier - * columns, but it seems too expensive and non-modular for this code to be - * aware of that refinement.) - * * We also build a Relids set showing which outer rels are required by the * selected clauses. Any lateral_relids are included in that, but not * otherwise accounted for. 
*/ index_clauses = NIL; - found_lower_saop_clause = false; outer_relids = bms_copy(rel->lateral_relids); for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++) { @@ -903,30 +864,18 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexClause *iclause = (IndexClause *) lfirst(lc); RestrictInfo *rinfo = iclause->rinfo; - /* We might need to omit ScalarArrayOpExpr clauses */ - if (IsA(rinfo->clause, ScalarArrayOpExpr)) + if (skip_nonnative_saop && !index->amsearcharray && + IsA(rinfo->clause, ScalarArrayOpExpr)) { - if (!index->amsearcharray) - { - if (skip_nonnative_saop) - { - /* Ignore because not supported by index */ - *skip_nonnative_saop = true; - continue; - } - /* Caller had better intend this only for bitmap scan */ - Assert(scantype == ST_BITMAPSCAN); - } - if (indexcol > 0) - { - if (skip_lower_saop) - { - /* Caller doesn't want to lose index ordering */ - *skip_lower_saop = true; - continue; - } - found_lower_saop_clause = true; - } + /* + * Caller asked us to generate IndexPaths that omit any + * ScalarArrayOpExpr clauses when the underlying index AM + * lacks native support. + * + * We must omit this clause (and tell caller about it). + */ + *skip_nonnative_saop = true; + continue; } /* OK to include this clause */ @@ -956,11 +905,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, /* * 2. Compute pathkeys describing index's ordering, if any, then see how * many of them are actually useful for this query. This is not relevant - * if we are only trying to build bitmap indexscans, nor if we have to - * assume the scan is unordered. + * if we are only trying to build bitmap indexscans. */ pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN && - !found_lower_saop_clause && has_useful_pathkeys(root, rel)); index_is_ordered = (index->sortopfamily != NULL); if (index_is_ordered && pathkeys_possibly_useful) @@ -1212,7 +1159,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, index, &clauseset, useful_predicate, ST_BITMAPSCAN, - NULL, NULL); result = list_concat(result, indexpaths); } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index cea777e9d4..35f8f306ee 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6572,21 +6572,26 @@ genericcostestimate(PlannerInfo *root, selectivityQuals = add_predicate_to_index_quals(index, indexQuals); /* - * Check for ScalarArrayOpExpr index quals, and estimate the number of - * index scans that will be performed. + * If caller didn't give us an estimate for ScalarArrayOpExpr index scans, + * just assume that the number of index descents is the number of distinct + * combinations of array elements from all of the scan's SAOP clauses. 
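+ * + * (For example, given "a IN (1, 2, 3) AND b IN (7, 8)" we'd assume 3 * 2 = 6 index descents by default -- one per distinct combination of (a, b) array elements.)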
*/ - num_sa_scans = 1; - foreach(l, indexQuals) + num_sa_scans = costs->num_sa_scans; + if (num_sa_scans < 1) { - RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); - - if (IsA(rinfo->clause, ScalarArrayOpExpr)) + num_sa_scans = 1; + foreach(l, indexQuals) { - ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; - double alength = estimate_array_length(root, lsecond(saop->args)); + RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); - if (alength > 1) - num_sa_scans *= alength; + if (IsA(rinfo->clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; + double alength = estimate_array_length(root, lsecond(saop->args)); + + if (alength > 1) + num_sa_scans *= alength; + } } } @@ -6813,9 +6818,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * For a RowCompareExpr, we consider only the first column, just as * rowcomparesel() does. * - * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N - * index scans not one, but the ScalarArrayOpExpr's operator can be - * considered to act the same as it normally does. + * If there's a ScalarArrayOpExpr in the quals, we'll actually perform up + * to N index descents (not just one), but the ScalarArrayOpExpr's + * operator can be considered to act the same as it normally does. */ indexBoundQuals = NIL; indexcol = 0; @@ -6867,7 +6872,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, clause_op = saop->opno; found_saop = true; - /* count number of SA scans induced by indexBoundQuals only */ + /* estimate SA descents by indexBoundQuals only */ if (alength > 1) num_sa_scans *= alength; } @@ -6930,10 +6935,48 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, NULL); numIndexTuples = btreeSelectivity * index->rel->tuples; + /* + * btree automatically combines individual ScalarArrayOpExpr primitive + * index scans whenever the tuples covered by the next set of array + * keys are close to tuples covered by the current set. That puts a + * natural ceiling on the worst case number of descents -- there + * cannot possibly be more than one descent per leaf page scanned. + * + * Clamp the number of descents to at most 1/3 the number of index + * pages. This avoids implausibly high estimates with low selectivity + * paths, where scans usually require only one or two descents. This + * is most likely to help when there are several SAOP clauses, where + * naively accepting the total number of distinct combinations of + * array elements as the number of descents would frequently lead to + * wild overestimates. + * + * We somewhat arbitrarily don't just make the cutoff the total number + * of leaf pages (we make it 1/3 the total number of pages instead) to + * give the btree code credit for its ability to continue on the leaf + * level with low selectivity scans. + */ + num_sa_scans = Min(num_sa_scans, ceil(index->pages * 0.3333333)); + num_sa_scans = Max(num_sa_scans, 1); + /* * As in genericcostestimate(), we have to adjust for any * ScalarArrayOpExpr quals included in indexBoundQuals, and then round * to integer. + * + * It is tempting to make genericcostestimate behave as if SAOP + * clauses work in almost the same way as scalar operators during + * btree scans, making the top-level scan look like a continuous scan + * (as opposed to num_sa_scans-many primitive index scans). After + * all, btree scans mostly work like that at runtime. 
However, such a + * scheme would badly bias genericcostestimate's simplistic approach + * to calculating numIndexPages through prorating. + * + * Stick with the approach taken by non-native SAOP scans for now. + * genericcostestimate will use the Mackert-Lohman formula to + * compensate for repeat page fetches, even though that definitely + * won't happen during btree scans (not for leaf pages, at least). + * We're usually very pessimistic about the number of primitive index + * scans that will be required, but it's not clear how to do better. */ numIndexTuples = rint(numIndexTuples / num_sa_scans); } @@ -6942,6 +6985,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * Now do generic index cost estimation. */ costs.numIndexTuples = numIndexTuples; + costs.num_sa_scans = num_sa_scans; genericcostestimate(root, path, loop_count, &costs); @@ -6952,9 +6996,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * comparisons to descend a btree of N leaf tuples. We charge one * cpu_operator_cost per comparison. * - * If there are ScalarArrayOpExprs, charge this once per SA scan. The - * ones after the first one are not startup cost so far as the overall - * plan is concerned, so add them only to "total" cost. + * If there are ScalarArrayOpExprs, charge this once per estimated SA + * index descent. The ones after the first one are not startup cost so + * far as the overall plan goes, so just add them to "total" cost. */ if (index->tuples > 1) /* avoid computing log(0) */ { @@ -6971,7 +7015,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * in cases where only a single leaf page is expected to be visited. This * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page * touched. The number of such pages is btree tree height plus one (ie, - * we charge for the leaf page too). As above, charge once per SA scan. + * we charge for the leaf page too). As above, charge once per estimated + * SA index descent.
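+ * + * (E.g., with tree_height = 2, each estimated descent gets charged for 3 pages: the root, one other internal page, and the leaf page.)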
*/ descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost; costs.indexStartupCost += descentCost; diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 2c6c307efc..00300dd720 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -194,7 +194,7 @@ typedef void (*amrestrpos_function) (IndexScanDesc scan); */ /* estimate size of parallel scan descriptor */ -typedef Size (*amestimateparallelscan_function) (void); +typedef Size (*amestimateparallelscan_function) (int nkeys, int norderbys); /* prepare for parallel index scan */ typedef void (*aminitparallelscan_function) (void *target); diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 8026c2b36d..fdcfbe8db7 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -165,7 +165,8 @@ extern void index_rescan(IndexScanDesc scan, extern void index_endscan(IndexScanDesc scan); extern void index_markpos(IndexScanDesc scan); extern void index_restrpos(IndexScanDesc scan); -extern Size index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot); +extern Size index_parallelscan_estimate(Relation indexRelation, + int nkeys, int norderbys, Snapshot snapshot); extern void index_parallelscan_initialize(Relation heapRelation, Relation indexRelation, Snapshot snapshot, ParallelIndexScanDesc target); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6eb162052e..b9053219a6 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -960,11 +960,20 @@ typedef struct BTScanPosData * moreLeft and moreRight track whether we think there may be matching * index entries to the left and right of the current page, respectively. * We can clear the appropriate one of these flags when _bt_checkkeys() - * returns continuescan = false. + * sets BTReadPageState.continuescan = false. */ bool moreLeft; bool moreRight; + /* + * Direction of the scan at the time that _bt_readpage was called. + * + * Used by btrestrpos to "restore" the scan's array keys by resetting each + * array to its first element's value (first in this scan direction). This + * avoids the need to directly track the array keys in btmarkpos. + */ + ScanDirection dir; + /* * If we are doing an index-only scan, nextTupleOffset is the first free * location in the associated tuple storage workspace. @@ -1022,9 +1031,8 @@ typedef BTScanPosData *BTScanPos; /* We need one of these for each equality-type SK_SEARCHARRAY scan key */ typedef struct BTArrayKeyInfo { - int scan_key; /* index of associated key in arrayKeyData */ + int scan_key; /* index of associated key in keyData */ int cur_elem; /* index of current element in elem_values */ - int mark_elem; /* index of marked element in elem_values */ int num_elems; /* number of elems in current array value */ Datum *elem_values; /* array of num_elems Datums */ } BTArrayKeyInfo; @@ -1037,14 +1045,11 @@ typedef struct BTScanOpaqueData ScanKey keyData; /* array of preprocessed scan keys */ /* workspace for SK_SEARCHARRAY support */ - ScanKey arrayKeyData; /* modified copy of scan->keyData */ - bool arraysStarted; /* Started array keys, but have yet to "reach - * past the end" of all arrays? 
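[Editor's note: to illustrate the widened amestimateparallelscan contract above, a hypothetical out-of-core index AM could size its shared descriptor from nkeys alone, since the scan key values themselves are not yet available at this point. None of the names below come from the patch.]

    #include "postgres.h"
    #include "access/amapi.h"

    /* Hypothetical AM-specific parallel scan descriptor */
    typedef struct HypoParallelScanDesc
    {
        int     phase;                                  /* coordination state */
        int     elem_positions[FLEXIBLE_ARRAY_MEMBER];  /* one per scan key */
    } HypoParallelScanDesc;

    static Size
    hypoestimateparallelscan(int nkeys, int norderbys)
    {
        /* Pessimistically assume every scan key needs an array position */
        return offsetof(HypoParallelScanDesc, elem_positions) +
            sizeof(int) * nkeys;
    }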
*/ - int numArrayKeys; /* number of equality-type array keys (-1 if - * there are any unsatisfiable array keys) */ - int arrayKeyCount; /* count indicating number of array scan keys - * processed */ + int numArrayKeys; /* number of equality-type array keys */ + bool needPrimScan; /* New prim scan to continue in current dir? */ + bool scanBehind; /* Last array advancement matched -inf attr? */ BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + FmgrInfo *orderProcs; /* ORDER procs for required equality keys */ MemoryContext arrayContext; /* scan-lifespan context for array data */ /* info about killed items if any (killedItems is NULL if never used) */ @@ -1075,6 +1080,42 @@ typedef struct BTScanOpaqueData typedef BTScanOpaqueData *BTScanOpaque; +/* + * _bt_readpage state used across _bt_checkkeys calls for a page + */ +typedef struct BTReadPageState +{ + /* Input parameters, set by _bt_readpage for _bt_checkkeys */ + ScanDirection dir; /* current scan direction */ + OffsetNumber minoff; /* Lowest non-pivot tuple's offset */ + OffsetNumber maxoff; /* Highest non-pivot tuple's offset */ + IndexTuple finaltup; /* Needed by scans with array keys */ + BlockNumber prev_scan_page; /* previous _bt_parallel_release block */ + Page page; /* Page being read */ + + /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */ + OffsetNumber offnum; /* current tuple's page offset number */ + + /* Output parameter, set by _bt_checkkeys for _bt_readpage */ + OffsetNumber skip; /* Array keys "look ahead" skip offnum */ + bool continuescan; /* Terminate ongoing (primitive) index scan? */ + + /* + * Input and output parameters, set and unset by both _bt_readpage and + * _bt_checkkeys to manage precheck optimizations + */ + bool prechecked; /* precheck set continuescan to 'true'? */ + bool firstmatch; /* at least one match so far? */ + + /* + * Private _bt_checkkeys state used to manage "look ahead" optimization + * (only used during scans with array keys) + */ + int16 rechecks; + int16 targetdistance; + +} BTReadPageState; + /* * We use some private sk_flags bits in preprocessed scan keys. We're allowed * to use bits 16-31 (see skey.h). 
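[Editor's note: to show how the BTReadPageState fields above fit together, here is a heavily simplified sketch of a _bt_readpage-style loop driving _bt_checkkeys. It covers a forward scan only; buffer locking, pivot/finaltup handling, kill-tuple logic, and most of the real function's details are omitted, and sketch_readpage itself is not part of the patch.]

    static void
    sketch_readpage(IndexScanDesc scan, ScanDirection dir, Page page,
                    bool arrayKeys)
    {
        BTPageOpaque    opaque = BTPageGetOpaque(page);
        BTReadPageState pstate = {0};

        pstate.dir = dir;
        pstate.page = page;
        pstate.minoff = P_FIRSTDATAKEY(opaque);
        pstate.maxoff = PageGetMaxOffsetNumber(page);
        pstate.continuescan = true;     /* default assumption */

        for (OffsetNumber off = pstate.minoff; off <= pstate.maxoff; off++)
        {
            ItemId      iid = PageGetItemId(page, off);
            IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);

            pstate.offnum = off;
            if (_bt_checkkeys(scan, &pstate, arrayKeys, itup,
                              BTreeTupleGetNAtts(itup, scan->indexRelation)))
                pstate.firstmatch = true;   /* tuple passed all scan keys */

            if (!pstate.continuescan)
                break;                  /* this primitive scan is over */

            if (pstate.skip > off)
            {
                /* array keys requested a "look ahead" skip on this page */
                off = pstate.skip - 1;  /* loop increment lands on skip */
                pstate.skip = InvalidOffsetNumber;
            }
        }
    }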
The uppermost bits are copied from the @@ -1128,7 +1169,7 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, bool indexUnchanged, struct IndexInfo *indexInfo); extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); -extern Size btestimateparallelscan(void); +extern Size btestimateparallelscan(int nkeys, int norderbys); extern void btinitparallelscan(void *target); extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); @@ -1149,10 +1190,12 @@ extern bool btcanreturn(Relation index, int attno); /* * prototypes for internal functions in nbtree.c */ -extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno); +extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, + bool first); extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); -extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); +extern void _bt_parallel_primscan_schedule(IndexScanDesc scan, + BlockNumber prev_scan_page); /* * prototypes for functions in nbtdedup.c @@ -1243,15 +1286,11 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost); */ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); -extern void _bt_preprocess_array_keys(IndexScanDesc scan); +extern bool _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir); -extern void _bt_mark_array_keys(IndexScanDesc scan); -extern void _bt_restore_array_keys(IndexScanDesc scan); extern void _bt_preprocess_keys(IndexScanDesc scan); -extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, - int tupnatts, ScanDirection dir, bool *continuescan, - bool requiredMatchedByPrecheck, bool haveFirstMatch); +extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, + IndexTuple tuple, int tupnatts); extern void _bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 2fa4c4fc1b..f2563ad1cb 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -117,6 +117,9 @@ typedef struct VariableStatData * Callers should initialize all fields of GenericCosts to zero. In addition, * they can set numIndexTuples to some positive value if they have a better * than default way of estimating the number of leaf index tuples visited. + * Similarly, they can set num_sa_scans to some value >= 1 for an index AM + * that doesn't necessarily perform exactly one primitive index scan per + * distinct combination of ScalarArrayOp array elements. 
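[Editor's note: an out-of-core index AM might exercise the new GenericCosts field along these lines. hypocostestimate and the constant 4 are invented for illustration; per the comment above, leaving num_sa_scans zeroed preserves the old default derivation.]

    #include "postgres.h"
    #include "utils/selfuncs.h"

    static void
    hypocostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
                     Cost *indexStartupCost, Cost *indexTotalCost,
                     Selectivity *indexSelectivity, double *indexCorrelation,
                     double *indexPages)
    {
        GenericCosts costs = {0};

        /* This hypothetical AM expects four descents for its SAOP strategy */
        costs.num_sa_scans = 4;

        genericcostestimate(root, path, loop_count, &costs);

        *indexStartupCost = costs.indexStartupCost;
        *indexTotalCost = costs.indexTotalCost;
        *indexSelectivity = costs.indexSelectivity;
        *indexCorrelation = costs.indexCorrelation;
        *indexPages = costs.numIndexPages;
    }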
*/ typedef struct { diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 8311a03c3d..510646cbce 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -189,6 +189,58 @@ select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limi 48 | 8 (1 row) +-- +-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants +-- +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); + QUERY PLAN +------------------------------------------------------------------ + Unique + -> Index Only Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[])) +(3 rows) + +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); + hundred +--------- + 47 + 48 + 72 + 82 +(4 rows) + +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; + QUERY PLAN +------------------------------------------------------------------ + Unique + -> Index Only Scan Backward using tenk1_hundred on tenk1 + Index Cond: (hundred = ANY ('{47,48,72,82}'::integer[])) +(3 rows) + +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; + hundred +--------- + 82 + 72 + 48 + 47 +(4 rows) + +explain (costs off) +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; + QUERY PLAN +--------------------------------------------------------------------------------------- + Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand = ANY ('{364,366,380}'::integer[])) AND (tenthous = 200000)) +(2 rows) + +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; + thousand +---------- +(0 rows) + -- -- Check correct optimization of LIKE (special index operator support) -- for both indexscan and bitmapscan cases diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 70ab47a92f..cf6eac5734 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1698,6 +1698,12 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; 0 (1 row) +SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1); + count +------- + 1 +(1 row) + DROP INDEX onek_nulltest; CREATE UNIQUE INDEX onek_nulltest ON onek_with_null (unique2 desc nulls last,unique1); SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL; @@ -1910,7 +1916,7 @@ SELECT count(*) FROM dupindexcols (1 row) -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) SELECT unique1 FROM tenk1 @@ -1932,49 +1938,186 @@ ORDER BY unique1; 42 (3 rows) +-- Non-required array scan key on "tenthous": explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: (thousand < 2) - Filter: (tenthous = ANY ('{1001,3000}'::integer[])) + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY 
thousand; + thousand | tenthous +----------+---------- + 0 | 3000 + 1 | 1001 +(2 rows) + +-- Non-required array scan key on "tenthous", backward scan: +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + QUERY PLAN +-------------------------------------------------------------------------------- + Index Only Scan Backward using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + thousand | tenthous +----------+---------- + 1 | 1001 + 0 | 3000 +(2 rows) + +-- +-- Check elimination of redundant and contradictory index quals +-- +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = ANY ('{7,8,9}'::integer[]))) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + unique1 +--------- + 7 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{7,14,22}'::integer[])) AND (unique1 = ANY ('{33,44}'::bigint[]))) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + QUERY PLAN +--------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 1)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + unique1 +--------- + 1 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + QUERY PLAN +------------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 = 12345)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + QUERY PLAN +----------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 >= 42)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + unique1 +--------- + 42 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + QUERY PLAN +---------------------------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 > 42)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + 
unique1 +--------- +(0 rows) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + QUERY PLAN +-------------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 > 9996) AND (unique1 >= 9999)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + unique1 +--------- + 9999 +(1 row) + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + QUERY PLAN +-------------------------------------------------- + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 < 3) AND (unique1 <= 3)) +(2 rows) + +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + unique1 +--------- + 0 + 1 + 2 (3 rows) -SELECT thousand, tenthous FROM tenk1 -WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; - thousand | tenthous -----------+---------- - 0 | 3000 - 1 | 1001 +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + QUERY PLAN +------------------------------------------------------------ + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 < 3) AND (unique1 < '-1'::bigint)) (2 rows) -SET enable_indexonlyscan = OFF; +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + unique1 +--------- +(0 rows) + explain (costs off) -SELECT thousand, tenthous FROM tenk1 -WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; QUERY PLAN -------------------------------------------------------------------------------------- - Sort - Sort Key: thousand - -> Index Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(4 rows) - -SELECT thousand, tenthous FROM tenk1 -WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; - thousand | tenthous -----------+---------- - 0 | 3000 - 1 | 1001 + Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: ((unique1 = ANY ('{1,42,7}'::integer[])) AND (unique1 < '-1'::bigint)) (2 rows) -RESET enable_indexonlyscan; +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; + unique1 +--------- +(0 rows) + -- -- Check elimination of constant-NULL subexpressions -- diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 63cddac0d6..8b640c2fc2 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -8880,10 +8880,9 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]); Merge Cond: (j1.id1 = j2.id1) Join Filter: (j2.id2 = j1.id2) -> Index Scan using j1_id1_idx on j1 - -> Index Only Scan using j2_pkey on j2 + -> Index Scan using j2_id1_idx on j2 Index Cond: (id1 >= ANY ('{1,5}'::integer[])) - Filter: ((id1 % 1000) = 1) -(7 rows) +(6 rows) select * from j1 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index 4ffc5b4c56..87273fa635 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -361,6 +361,7 @@ alter table tenk2 reset (parallel_workers); -- test parallel index scans. 
set enable_seqscan to off; set enable_bitmapscan to off; +set random_page_cost = 2; explain (costs off) select count((unique1)) from tenk1 where hundred > 1; QUERY PLAN @@ -379,6 +380,30 @@ select count((unique1)) from tenk1 where hundred > 1; 9800 (1 row) +-- Parallel ScalarArrayOp index scan +explain (costs off) + select count((unique1)) from tenk1 + where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); + QUERY PLAN +--------------------------------------------------------------------- + Finalize Aggregate + InitPlan 1 + -> Aggregate + -> Function Scan on generate_series i + -> Gather + Workers Planned: 4 + -> Partial Aggregate + -> Parallel Index Scan using tenk1_hundred on tenk1 + Index Cond: (hundred = ANY ((InitPlan 1).col1)) +(9 rows) + +select count((unique1)) from tenk1 +where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); + count +------- + 700 +(1 row) + -- test parallel index-only scans. explain (costs off) select count(*) from tenk1 where thousand > 95; diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index ef84354234..0d2a33f370 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -135,6 +135,21 @@ explain (costs off) select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1; select hundred, twenty from tenk1 where hundred <= 48 order by hundred desc limit 1; +-- +-- Add coverage for ScalarArrayOp btree quals with pivot tuple constants +-- +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82); + +explain (costs off) +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; +select distinct hundred from tenk1 where hundred in (47, 48, 72, 82) order by hundred desc; + +explain (costs off) +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; +select thousand from tenk1 where thousand in (364, 366,380) and tenthous = 200000; + -- -- Check correct optimization of LIKE (special index operator support) -- for both indexscan and bitmapscan cases diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index d49ce9f300..e296891cab 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -668,6 +668,7 @@ SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IS NOT NULL; SELECT count(*) FROM onek_with_null WHERE unique1 IS NOT NULL AND unique1 > 500; SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique1 > 500; +SELECT count(*) FROM onek_with_null WHERE unique1 IS NULL AND unique2 IN (-1, 0, 1); DROP INDEX onek_nulltest; @@ -753,7 +754,7 @@ SELECT count(*) FROM dupindexcols WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX'; -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) @@ -765,6 +766,7 @@ SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; +-- Non-required array scan key on "tenthous": explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -774,18 +776,68 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; -SET enable_indexonlyscan 
= OFF; - +-- Non-required array scan key on "tenthous", backward scan: explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +ORDER BY thousand DESC, tenthous DESC; SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) -ORDER BY thousand; +ORDER BY thousand DESC, tenthous DESC; -RESET enable_indexonlyscan; +-- +-- Check elimination of redundant and contradictory index quals +-- +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = ANY('{7, 8, 9}'); + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + +SELECT unique1 FROM tenk1 WHERE unique1 = ANY('{7, 14, 22}') and unique1 = ANY('{33, 44}'::bigint[]); + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 1; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 = 12345; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 >= 42; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 > 42; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + +SELECT unique1 FROM tenk1 WHERE unique1 > 9996 and unique1 >= 9999; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 <= 3; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + +SELECT unique1 FROM tenk1 WHERE unique1 < 3 and unique1 < (-1)::bigint; + +explain (costs off) +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; + +SELECT unique1 FROM tenk1 WHERE unique1 IN (1, 42, 7) and unique1 < (-1)::bigint; -- -- Check elimination of constant-NULL subexpressions diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index c43a5b2119..20376c03fa 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -137,11 +137,19 @@ alter table tenk2 reset (parallel_workers); -- test parallel index scans. set enable_seqscan to off; set enable_bitmapscan to off; +set random_page_cost = 2; explain (costs off) select count((unique1)) from tenk1 where hundred > 1; select count((unique1)) from tenk1 where hundred > 1; +-- Parallel ScalarArrayOp index scan +explain (costs off) + select count((unique1)) from tenk1 + where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); +select count((unique1)) from tenk1 +where hundred = any ((select array_agg(i) from generate_series(1, 100, 15) i)::int[]); + -- test parallel index-only scans. 
explain (costs off) select count(*) from tenk1 where thousand > 95; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 01845ee71d..f87e8b80ec 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -208,8 +208,10 @@ BTPageStat BTPageState BTParallelScanDesc BTPendingFSM +BTReadPageState BTScanInsert BTScanInsertData +BTScanKeyPreproc BTScanOpaque BTScanOpaqueData BTScanPos