From a2822fb9337a21f98ac4ce850bb4145acf47ca27 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 7 Oct 2011 20:13:02 -0400 Subject: [PATCH] Support index-only scans using the visibility map to avoid heap fetches. When a btree index contains all columns required by the query, and the visibility map shows that all tuples on a target heap page are visible-to-all, we don't need to fetch that heap page. This patch depends on the previous patches that made the visibility map reliable. There's a fair amount left to do here, notably trying to figure out a less chintzy way of estimating the cost of an index-only scan, but the core functionality seems ready to commit. Robert Haas and Ibrar Ahmed, with some previous work by Heikki Linnakangas. --- doc/src/sgml/catalogs.sgml | 7 + doc/src/sgml/config.sgml | 19 +- doc/src/sgml/indexam.sgml | 32 ++- doc/src/sgml/ref/postgres-ref.sgml | 7 +- src/backend/access/index/genam.c | 4 + src/backend/access/index/indexam.c | 236 +++++++++++------- src/backend/access/nbtree/nbtree.c | 92 +++++++ src/backend/commands/explain.c | 5 +- src/backend/executor/nodeIndexscan.c | 123 ++++++++- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/outfuncs.c | 2 + src/backend/optimizer/path/costsize.c | 21 ++ src/backend/optimizer/path/indxpath.c | 89 +++++++ src/backend/optimizer/plan/createplan.c | 9 +- src/backend/optimizer/plan/planner.c | 2 +- src/backend/optimizer/util/pathnode.c | 8 +- src/backend/optimizer/util/plancat.c | 1 + src/backend/optimizer/util/var.c | 43 ++-- src/backend/tcop/postgres.c | 5 +- src/backend/utils/cache/relcache.c | 4 +- src/backend/utils/misc/guc.c | 9 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/genam.h | 3 + src/include/access/relscan.h | 5 + src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_am.h | 52 ++-- src/include/nodes/execnodes.h | 2 + src/include/nodes/plannodes.h | 5 + src/include/nodes/relation.h | 6 + src/include/optimizer/cost.h | 4 +- src/include/optimizer/pathnode.h | 1 + src/include/optimizer/var.h | 2 +- src/test/regress/expected/aggregates.out | 78 +++--- src/test/regress/expected/rangefuncs.out | 27 +- 34 files changed, 704 insertions(+), 203 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index d6baf84248..0495bd03bd 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -476,6 +476,13 @@ Does the access method support multicolumn indexes? + + amcanreturn + bool + + Can the access method return the contents of index entries? + + amoptionalkey bool diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index fbcd455694..d3a8b2648d 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2404,6 +2404,22 @@ SET ENABLE_SEQSCAN TO OFF; + + enable_indexonlyscan (boolean) + + index-only scan + + + enable_indexonlyscan configuration parameter + + + + Enables or disables the query planner's use of index-only-scan plan + types. The default is on. 
+ + + + enable_material (boolean) @@ -6353,7 +6369,7 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1) , , , - , , + , , , , @@ -6362,6 +6378,7 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1) enable_indexscan = off, enable_mergejoin = off, enable_nestloop = off, + enable_indexonlyscan = off, enable_seqscan = off, enable_tidscan = off diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index bb94258385..724b413755 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -134,6 +134,11 @@ amsearchnulls, indicating that it supports IS NULL and IS NOT NULL clauses as search conditions. + An index method can also set amcanreturn, + indicating that it can support index-only scans by returning + the indexed column values for an index entry in the form of an IndexTuple. + (An example of an index AM that cannot do this is hash, which stores only + the hash values not the original data.) @@ -385,6 +390,18 @@ amgettuple (IndexScanDesc scan, callers. + + If the access method supports index-only scans (i.e., + amcanreturn is TRUE in its pg_am + row), then on success it must also check + scan->xs_want_itup, and if that is true it should return + the original indexed data for the index entry, in the form of an + IndexTuple stored at scan->xs_itup. However, + it is permissible for the access method to sometimes fail to provide this + data, in which case it must set scan->xs_itup to NULL. That + will result in a regular heap fetch occurring. + + The amgettuple function need only be provided if the access method supports plain index scans. If it doesn't, the @@ -581,6 +598,15 @@ amrestrpos (IndexScanDesc scan); deleted. + + If the index stores the original indexed data values (and not some lossy + representation of them), it is useful to support index-only scans, in + which the index returns the actual data not just the TID of the heap tuple. + This will only work if the visibility map shows that the TID is on an + all-visible page; else the heap tuple must be visited anyway to check + MVCC visibility. But that is no concern of the access method's. + + Instead of using amgettuple, an index scan can be done with amgetbitmap to fetch all tuples in one call. This can be @@ -593,7 +619,11 @@ amrestrpos (IndexScanDesc scan); supported. Secondly, the tuples are returned in a bitmap which doesn't have any specific ordering, which is why amgetbitmap doesn't take a direction argument. (Ordering operators will never be - supplied for such a scan, either.) Finally, amgetbitmap + supplied for such a scan, either.) + Also, there is no provision for index-only scans with + amgetbitmap, since there is no way to return the contents of + index tuples. + Finally, amgetbitmap does not guarantee any locking of the returned tuples, with implications spelled out in . diff --git a/doc/src/sgml/ref/postgres-ref.sgml b/doc/src/sgml/ref/postgres-ref.sgml index 3807f4090a..9869a1f526 100644 --- a/doc/src/sgml/ref/postgres-ref.sgml +++ b/doc/src/sgml/ref/postgres-ref.sgml @@ -376,12 +376,15 @@ PostgreSQL documentation - { s | i | m | n | h } + { s | i | o | b | t | n | m | h } Forbids the use of particular scan and join methods: s and i - disable sequential and index scans respectively, while + disable sequential and index scans respectively, + o, b and t + disable index-only scans, bitmap index scans, and TID scans + respectively, while n, m, and h disable nested-loop, merge and hash joins respectively. 
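As a usage sketch (not taken from the patch itself; the table and index names follow the regression-test schema, and the exact plan chosen depends on statistics and visibility-map state), the new plan type and GUC behave roughly like this:

    -- Hypothetical session: every output column is available from the index,
    -- so the planner may choose the new Index Only Scan node.
    EXPLAIN (COSTS OFF) SELECT unique1 FROM tenk1 WHERE unique1 < 100;
    --                  QUERY PLAN
    -- ----------------------------------------------
    --  Index Only Scan using tenk1_unique1 on tenk1
    --    Index Cond: (unique1 < 100)

    -- The new planner GUC (or "postgres -f o") disables the plan type again.
    SET enable_indexonlyscan = off;
    EXPLAIN (COSTS OFF) SELECT unique1 FROM tenk1 WHERE unique1 < 100;
    --  Index Scan using tenk1_unique1 on tenk1   (or a bitmap scan, depending on costs)
    --    Index Cond: (unique1 < 100)
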
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 98832adb8a..236e48912b 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -93,6 +93,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) else scan->orderByData = NULL; + scan->xs_want_itup = false; /* may be set later */ + /* * During recovery we ignore killed tuples and don't bother to kill them * either. We do this because the xmin on the primary node could easily be @@ -109,6 +111,8 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->opaque = NULL; + scan->xs_itup = NULL; + ItemPointerSetInvalid(&scan->xs_ctup.t_self); scan->xs_ctup.t_data = NULL; scan->xs_cbuf = InvalidBuffer; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 230af9bfa3..3e0797a5c2 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -20,7 +20,9 @@ * index_insert - insert an index tuple into a relation * index_markpos - mark a scan position * index_restrpos - restore a scan position - * index_getnext - get the next tuple from a scan + * index_getnext_tid - get the next TID from a scan + * index_fetch_heap - get the scan's next heap tuple + * index_getnext - get the next heap tuple from a scan * index_getbitmap - get all tuples from a scan * index_bulk_delete - bulk deletion of index tuples * index_vacuum_cleanup - post-deletion cleanup of an index @@ -422,13 +424,143 @@ index_restrpos(IndexScanDesc scan) FunctionCall1(procedure, PointerGetDatum(scan)); } +/* ---------------- + * index_getnext_tid - get the next TID from a scan + * + * The result is the next TID satisfying the scan keys, + * or NULL if no more matching tuples exist. + * ---------------- + */ +ItemPointer +index_getnext_tid(IndexScanDesc scan, ScanDirection direction) +{ + FmgrInfo *procedure; + bool found; + + SCAN_CHECKS; + GET_SCAN_PROCEDURE(amgettuple); + + Assert(TransactionIdIsValid(RecentGlobalXmin)); + + /* + * The AM's gettuple proc finds the next index entry matching the scan + * keys, and puts the TID in xs_ctup.t_self. It should also set + * scan->xs_recheck, though we pay no attention to that here. + */ + found = DatumGetBool(FunctionCall2(procedure, + PointerGetDatum(scan), + Int32GetDatum(direction))); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* ... but first, release any held pin on a heap page */ + if (BufferIsValid(scan->xs_cbuf)) + { + ReleaseBuffer(scan->xs_cbuf); + scan->xs_cbuf = InvalidBuffer; + } + return NULL; + } + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the TID of the tuple we found. */ + return &scan->xs_ctup.t_self; +} + +/* ---------------- + * index_fetch_heap - get the scan's next heap tuple + * + * The result is a visible heap tuple associated with the index TID most + * recently fetched by index_getnext_tid, or NULL if no more matching tuples + * exist. (There can be more than one matching tuple because of HOT chains, + * although when using an MVCC snapshot it should be impossible for more than + * one such tuple to exist.) + * + * On success, the buffer containing the heap tup is pinned (the pin will be + * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan + * call). + * + * Note: caller must check scan->xs_recheck, and perform rechecking of the + * scan keys if required. 
We do not do that here because we don't have + * enough information to do it efficiently in the general case. + * ---------------- + */ +HeapTuple +index_fetch_heap(IndexScanDesc scan) +{ + ItemPointer tid = &scan->xs_ctup.t_self; + bool all_dead = false; + bool got_heap_tuple; + + /* We can skip the buffer-switching logic if we're in mid-HOT chain. */ + if (!scan->xs_continue_hot) + { + /* Switch to correct buffer if we don't have it already */ + Buffer prev_buf = scan->xs_cbuf; + + scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, + scan->heapRelation, + ItemPointerGetBlockNumber(tid)); + + /* + * Prune page, but only if we weren't already on this page + */ + if (prev_buf != scan->xs_cbuf) + heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, + RecentGlobalXmin); + } + + /* Obtain share-lock on the buffer so we can examine visibility */ + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); + got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, + scan->xs_cbuf, + scan->xs_snapshot, + &scan->xs_ctup, + &all_dead, + !scan->xs_continue_hot); + LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); + + if (got_heap_tuple) + { + /* + * Only in a non-MVCC snapshot can more than one member of the + * HOT chain be visible. + */ + scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); + pgstat_count_heap_fetch(scan->indexRelation); + return &scan->xs_ctup; + } + + /* We've reached the end of the HOT chain. */ + scan->xs_continue_hot = false; + + /* + * If we scanned a whole HOT chain and found only dead tuples, tell index + * AM to kill its entry for that TID (this will take effect in the next + * amgettuple call, in index_getnext_tid). We do not do this when in + * recovery because it may violate MVCC to do so. See comments in + * RelationGetIndexScan(). + */ + if (!scan->xactStartedInRecovery) + scan->kill_prior_tuple = all_dead; + + return NULL; +} + /* ---------------- * index_getnext - get the next heap tuple from a scan * * The result is the next heap tuple satisfying the scan keys and the - * snapshot, or NULL if no more matching tuples exist. On success, - * the buffer containing the heap tuple is pinned (the pin will be dropped - * at the next index_getnext or index_endscan). + * snapshot, or NULL if no more matching tuples exist. + * + * On success, the buffer containing the heap tup is pinned (the pin will be + * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan + * call). * * Note: caller must check scan->xs_recheck, and perform rechecking of the * scan keys if required. We do not do that here because we don't have @@ -438,20 +570,11 @@ index_restrpos(IndexScanDesc scan) HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { - HeapTuple heapTuple = &scan->xs_ctup; - ItemPointer tid = &heapTuple->t_self; - FmgrInfo *procedure; - bool all_dead = false; - - SCAN_CHECKS; - GET_SCAN_PROCEDURE(amgettuple); - - Assert(TransactionIdIsValid(RecentGlobalXmin)); + HeapTuple heapTuple; + ItemPointer tid; for (;;) { - bool got_heap_tuple; - if (scan->xs_continue_hot) { /* @@ -459,86 +582,27 @@ index_getnext(IndexScanDesc scan, ScanDirection direction) * earlier member. Must still hold pin on current heap page. 
*/ Assert(BufferIsValid(scan->xs_cbuf)); - Assert(ItemPointerGetBlockNumber(tid) == + Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) == BufferGetBlockNumber(scan->xs_cbuf)); } else { - bool found; - Buffer prev_buf; + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scan, direction); - /* - * If we scanned a whole HOT chain and found only dead tuples, - * tell index AM to kill its entry for that TID. We do not do this - * when in recovery because it may violate MVCC to do so. see - * comments in RelationGetIndexScan(). - */ - if (!scan->xactStartedInRecovery) - scan->kill_prior_tuple = all_dead; - - /* - * The AM's gettuple proc finds the next index entry matching the - * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It - * should also set scan->xs_recheck, though we pay no attention to - * that here. - */ - found = DatumGetBool(FunctionCall2(procedure, - PointerGetDatum(scan), - Int32GetDatum(direction))); - - /* Reset kill flag immediately for safety */ - scan->kill_prior_tuple = false; - - /* If we're out of index entries, break out of outer loop */ - if (!found) + /* If we're out of index entries, we're done */ + if (tid == NULL) break; - - pgstat_count_index_tuples(scan->indexRelation, 1); - - /* Switch to correct buffer if we don't have it already */ - prev_buf = scan->xs_cbuf; - scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, - scan->heapRelation, - ItemPointerGetBlockNumber(tid)); - - /* - * Prune page, but only if we weren't already on this page - */ - if (prev_buf != scan->xs_cbuf) - heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, - RecentGlobalXmin); } - /* Obtain share-lock on the buffer so we can examine visibility */ - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); - got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation, - scan->xs_cbuf, - scan->xs_snapshot, - &scan->xs_ctup, - &all_dead, - !scan->xs_continue_hot); - LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); - - if (got_heap_tuple) - { - /* - * Only in a non-MVCC snapshot can more than one member of the - * HOT chain be visible. - */ - scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot); - pgstat_count_heap_fetch(scan->indexRelation); + /* + * Fetch the next (or only) visible heap tuple for this index entry. + * If we don't find anything, loop around and grab the next TID from + * the index. + */ + heapTuple = index_fetch_heap(scan); + if (heapTuple != NULL) return heapTuple; - } - - /* Loop around to ask index AM for another TID */ - scan->xs_continue_hot = false; - } - - /* Release any held pin on a heap page */ - if (BufferIsValid(scan->xs_cbuf)) - { - ReleaseBuffer(scan->xs_cbuf); - scan->xs_cbuf = InvalidBuffer; } return NULL; /* failure exit */ diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 996611516f..11b57659ee 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -73,6 +73,7 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTCycleId cycleid); static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno); +static IndexTuple bt_getindextuple(IndexScanDesc scan); /* @@ -310,9 +311,94 @@ btgettuple(PG_FUNCTION_ARGS) else res = _bt_first(scan, dir); + /* Return the whole index tuple if requested */ + if (scan->xs_want_itup) + { + /* First, free the last one ... 
*/ + if (scan->xs_itup != NULL) + { + pfree(scan->xs_itup); + scan->xs_itup = NULL; + } + + if (res) + scan->xs_itup = bt_getindextuple(scan); + } + PG_RETURN_BOOL(res); } +/* + * bt_getindextuple - fetch index tuple at current position. + * + * This can fail to find the tuple if new tuples have been inserted on the + * index page since we stepped onto the page. NULL is returned in that case. + * (We could try a bit harder by searching for the TID; but if insertions + * are happening, it's reasonably likely that an index-only scan will fail + * anyway because of visibility. So probably not worth the trouble.) + * + * The tuple returned is a palloc'd copy, so that we don't need to keep a + * lock on the index page. + * + * The caller must have pin on so->currPos.buf. + */ +static IndexTuple +bt_getindextuple(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int itemIndex; + OffsetNumber offnum; + IndexTuple ituple, + result; + + Assert(BufferIsValid(so->currPos.buf)); + + LockBuffer(so->currPos.buf, BT_READ); + + /* Locate the tuple, being paranoid about possibility the page changed */ + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + itemIndex = so->currPos.itemIndex; + /* pure paranoia */ + Assert(itemIndex >= so->currPos.firstItem && + itemIndex <= so->currPos.lastItem); + + offnum = so->currPos.items[itemIndex].indexOffset; + if (offnum < minoff || offnum > maxoff) + { + /* should never happen, since we have pin on page, but be careful */ + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + return NULL; + } + + ituple = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + + if (ItemPointerEquals(&ituple->t_tid, &scan->xs_ctup.t_self)) + { + /* yup, it's the desired tuple, so make a copy */ + Size itupsz = IndexTupleSize(ituple); + + result = palloc(itupsz); + memcpy(result, ituple, itupsz); + } + else + { + /* oops, it got moved */ + result = NULL; + } + + LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + + return result; +} + /* * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap */ @@ -464,6 +550,12 @@ btendscan(PG_FUNCTION_ARGS) pfree(so->keyData); pfree(so); + if (scan->xs_itup != NULL) + { + pfree(scan->xs_itup); + scan->xs_itup = NULL; + } + PG_RETURN_VOID(); } diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index cd9fc92923..fbcaf6cbe0 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -656,7 +656,10 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = sname = "Seq Scan"; break; case T_IndexScan: - pname = sname = "Index Scan"; + if (((IndexScan *) plan)->indexonly) + pname = sname = "Index Only Scan"; + else + pname = sname = "Index Scan"; break; case T_BitmapIndexScan: pname = sname = "Bitmap Index Scan"; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index da25384e86..32ed65797a 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -26,6 +26,7 @@ #include "access/nbtree.h" #include "access/relscan.h" +#include "access/visibilitymap.h" #include "executor/execdebug.h" #include "executor/nodeIndexscan.h" #include "optimizer/clauses.h" @@ -36,6 +37,7 @@ static TupleTableSlot *IndexNext(IndexScanState *node); +static void IndexStoreHeapTuple(TupleTableSlot *slot, 
IndexScanDesc scandesc); /* ---------------------------------------------------------------- @@ -54,6 +56,7 @@ IndexNext(IndexScanState *node) IndexScanDesc scandesc; HeapTuple tuple; TupleTableSlot *slot; + ItemPointer tid; /* * extract necessary information from index scan node @@ -73,19 +76,63 @@ IndexNext(IndexScanState *node) slot = node->ss.ss_ScanTupleSlot; /* - * ok, now that we have what we need, fetch the next tuple. + * OK, now that we have what we need, fetch the next TID. */ - while ((tuple = index_getnext(scandesc, direction)) != NULL) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { /* - * Store the scanned tuple in the scan tuple slot of the scan state. - * Note: we pass 'false' because tuples returned by amgetnext are - * pointers onto disk pages and must not be pfree()'d. + * Attempt index-only scan, if possible. For this, we need to have + * gotten an index tuple from the AM, and we need the TID to reference + * a heap page on which all tuples are known visible to everybody. + * If that's the case, we don't need to visit the heap page for tuple + * visibility testing, and we don't need any column values that are + * not available from the index. + * + * Note: in the index-only path, we are still holding pin on the + * scan's xs_cbuf, ie, the previously visited heap page. It's not + * clear whether it'd be better to release that pin. */ - ExecStoreTuple(tuple, /* tuple to store */ - slot, /* slot to store in */ - scandesc->xs_cbuf, /* buffer containing tuple */ - false); /* don't pfree */ + if (scandesc->xs_itup != NULL && + visibilitymap_test(scandesc->heapRelation, + ItemPointerGetBlockNumber(tid), + &node->iss_VMBuffer)) + { + /* + * Convert index tuple to look like a heap tuple, and store the + * results in the scan tuple slot. + */ + IndexStoreHeapTuple(slot, scandesc); + } + else + { + /* Index-only approach not possible, so fetch heap tuple. */ + tuple = index_fetch_heap(scandesc); + + /* Tuple might not be visible. */ + if (tuple == NULL) + continue; + + /* + * Only MVCC snapshots are supported here, so there should be no + * need to keep following the HOT chain once a visible entry has + * been found. If we did want to allow that, we'd need to keep + * more state to remember not to call index_getnext_tid next time. + */ + if (scandesc->xs_continue_hot) + elog(ERROR, "unsupported use of non-MVCC snapshot in executor"); + + /* + * Store the scanned tuple in the scan tuple slot of the scan + * state. + * + * Note: we pass 'false' because tuples returned by amgetnext are + * pointers onto disk pages and must not be pfree()'d. + */ + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + scandesc->xs_cbuf, /* buffer containing tuple */ + false); /* don't pfree */ + } /* * If the index was lossy, we have to recheck the index quals using @@ -113,6 +160,53 @@ IndexNext(IndexScanState *node) return ExecClearTuple(slot); } +/* + * IndexStoreHeapTuple + * + * When performing an index-only scan, we build a faux heap tuple + * from the index tuple. Columns not present in the index are set to + * NULL, which is OK because we know they won't be referenced. + * + * The faux tuple is built as a virtual tuple that depends on the + * scandesc's xs_itup, so that must remain valid for as long as we + * need the slot contents. 
+ */ +static void +IndexStoreHeapTuple(TupleTableSlot *slot, IndexScanDesc scandesc) +{ + Form_pg_index indexForm = scandesc->indexRelation->rd_index; + TupleDesc indexDesc = RelationGetDescr(scandesc->indexRelation); + int nindexatts = indexDesc->natts; + int nheapatts = slot->tts_tupleDescriptor->natts; + Datum *values = slot->tts_values; + bool *isnull = slot->tts_isnull; + int i; + + /* We must first set the slot to empty, and mark all columns as null */ + ExecClearTuple(slot); + + memset(isnull, true, nheapatts * sizeof(bool)); + + /* Transpose index tuple into heap tuple. */ + for (i = 0; i < nindexatts; i++) + { + int indexatt = indexForm->indkey.values[i]; + + /* Ignore expression columns, as well as system attributes */ + if (indexatt <= 0) + continue; + + Assert(indexatt <= nheapatts); + + values[indexatt - 1] = index_getattr(scandesc->xs_itup, i + 1, + indexDesc, + &isnull[indexatt - 1]); + } + + /* And now we can mark the slot as holding a virtual tuple. */ + ExecStoreVirtualTuple(slot); +} + /* * IndexRecheck -- access method routine to recheck a tuple in EvalPlanQual */ @@ -399,6 +493,13 @@ ExecEndIndexScan(IndexScanState *node) indexScanDesc = node->iss_ScanDesc; relation = node->ss.ss_currentRelation; + /* Release VM buffer pin, if any. */ + if (node->iss_VMBuffer != InvalidBuffer) + { + ReleaseBuffer(node->iss_VMBuffer); + node->iss_VMBuffer = InvalidBuffer; + } + /* * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext */ @@ -611,6 +712,10 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->iss_NumScanKeys, indexstate->iss_NumOrderByKeys); + /* Prepare for possible index-only scan */ + indexstate->iss_ScanDesc->xs_want_itup = node->indexonly; + indexstate->iss_VMBuffer = InvalidBuffer; + /* * If no run-time keys to calculate, go ahead and pass the scankeys to the * index AM. 
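A hedged end-to-end sketch of the executor behavior added above (the object names here are made up, and plan choice depends on costing): the heap fetch is skipped only for TIDs whose heap page the visibility map marks all-visible, which in practice means VACUUM must have run since the page was last modified.

    -- Hypothetical demo table; until VACUUM has set the visibility-map bits,
    -- IndexNext() falls back to index_fetch_heap() for each returned TID.
    CREATE TABLE ios_demo (id int, payload text);
    INSERT INTO ios_demo
        SELECT g, repeat('x', 100) FROM generate_series(1, 100000) g;
    CREATE INDEX ios_demo_id_idx ON ios_demo (id);

    VACUUM ios_demo;    -- marks unmodified pages all-visible in the visibility map
    ANALYZE ios_demo;

    EXPLAIN (COSTS OFF)
        SELECT id FROM ios_demo WHERE id BETWEEN 1000 AND 2000;
    --                    QUERY PLAN
    -- ---------------------------------------------------
    --  Index Only Scan using ios_demo_id_idx on ios_demo
    --    Index Cond: ((id >= 1000) AND (id <= 2000))
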
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 661a5162e6..5100642dd6 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -370,6 +370,7 @@ _copyIndexScan(IndexScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indexorderbyorig); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexonly); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 0d0ce3c203..9f56427774 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -447,6 +447,7 @@ _outIndexScan(StringInfo str, IndexScan *node) WRITE_NODE_FIELD(indexorderby); WRITE_NODE_FIELD(indexorderbyorig); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_BOOL_FIELD(indexonly); } static void @@ -1500,6 +1501,7 @@ _outIndexPath(StringInfo str, IndexPath *node) WRITE_NODE_FIELD(indexorderbys); WRITE_BOOL_FIELD(isjoininner); WRITE_ENUM_FIELD(indexscandir, ScanDirection); + WRITE_BOOL_FIELD(indexonly); WRITE_FLOAT_FIELD(indextotalcost, "%.2f"); WRITE_FLOAT_FIELD(indexselectivity, "%.4f"); WRITE_FLOAT_FIELD(rows, "%.0f"); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 7812a8628f..e480797ca8 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -110,6 +110,7 @@ Cost disable_cost = 1.0e10; bool enable_seqscan = true; bool enable_indexscan = true; +bool enable_indexonlyscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; @@ -119,6 +120,9 @@ bool enable_material = true; bool enable_mergejoin = true; bool enable_hashjoin = true; +/* Possibly this should become a GUC too */ +static double visibility_fraction = 0.9; + typedef struct { PlannerInfo *root; @@ -210,6 +214,7 @@ cost_seqscan(Path *path, PlannerInfo *root, * 'index' is the index to be used * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics) * 'indexOrderBys' is the list of ORDER BY operators for amcanorderbyop indexes + * 'indexonly' is true if it's an index-only scan * 'outer_rel' is the outer relation when we are considering using the index * scan as the inside of a nestloop join (hence, some of the indexQuals * are join clauses, and we should expect repeated scans of the index); @@ -232,6 +237,7 @@ cost_index(IndexPath *path, PlannerInfo *root, IndexOptInfo *index, List *indexQuals, List *indexOrderBys, + bool indexonly, RelOptInfo *outer_rel) { RelOptInfo *baserel = index->rel; @@ -314,6 +320,12 @@ cost_index(IndexPath *path, PlannerInfo *root, * For partially-correlated indexes, we ought to charge somewhere between * these two estimates. We currently interpolate linearly between the * estimates based on the correlation squared (XXX is that appropriate?). + * + * If it's an index-only scan, then we will not need to fetch any heap + * pages for which the visibility map shows all tuples are visible. + * Unfortunately, we have no stats as to how much of the heap is + * all-visible, and that's likely to be a rather unstable number anyway. + * We use an arbitrary constant visibility_fraction to estimate this. 
*---------- */ if (outer_rel != NULL && outer_rel->rows > 1) @@ -333,6 +345,8 @@ cost_index(IndexPath *path, PlannerInfo *root, (double) index->pages, root); + pages_fetched = ceil(pages_fetched * visibility_fraction); + max_IO_cost = (pages_fetched * spc_random_page_cost) / num_scans; /* @@ -352,6 +366,8 @@ cost_index(IndexPath *path, PlannerInfo *root, (double) index->pages, root); + pages_fetched = ceil(pages_fetched * visibility_fraction); + min_IO_cost = (pages_fetched * spc_random_page_cost) / num_scans; } else @@ -365,11 +381,16 @@ cost_index(IndexPath *path, PlannerInfo *root, (double) index->pages, root); + pages_fetched = ceil(pages_fetched * visibility_fraction); + /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */ max_IO_cost = pages_fetched * spc_random_page_cost; /* min_IO_cost is for the perfectly correlated case (csquared=1) */ pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + + pages_fetched = ceil(pages_fetched * visibility_fraction); + min_IO_cost = spc_random_page_cost; if (pages_fetched > 1) min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index fc96f4f1da..7090a7e0c0 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -18,6 +18,7 @@ #include #include "access/skey.h" +#include "access/sysattr.h" #include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_operator.h" @@ -88,6 +89,7 @@ static PathClauseUsage *classify_index_clause_usage(Path *path, List **clauselist); static void find_indexpath_quals(Path *bitmapqual, List **quals, List **preds); static int find_list_position(Node *node, List **nodelist); +static bool check_index_only(RelOptInfo *rel, IndexOptInfo *index); static List *group_clauses_by_indexkey(IndexOptInfo *index, List *clauses, List *outer_clauses, Relids outer_relids, @@ -314,6 +316,8 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, bool useful_predicate; bool found_clause; bool index_is_ordered; + bool index_only_scan = false; + bool checked_index_only = false; /* * Check that index supports the desired scan type(s) @@ -438,6 +442,10 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, */ if (found_clause || useful_pathkeys != NIL || useful_predicate) { + /* First, detect whether index-only scan is possible */ + index_only_scan = check_index_only(rel, index); + checked_index_only = true; + ipath = create_index_path(root, index, restrictclauses, orderbyclauses, @@ -445,6 +453,7 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, index_is_ordered ? ForwardScanDirection : NoMovementScanDirection, + index_only_scan, outer_rel); result = lappend(result, ipath); } @@ -462,11 +471,15 @@ find_usable_indexes(PlannerInfo *root, RelOptInfo *rel, index_pathkeys); if (useful_pathkeys != NIL) { + if (!checked_index_only) + index_only_scan = check_index_only(rel, index); + ipath = create_index_path(root, index, restrictclauses, NIL, useful_pathkeys, BackwardScanDirection, + index_only_scan, outer_rel); result = lappend(result, ipath); } @@ -1040,6 +1053,82 @@ find_list_position(Node *node, List **nodelist) } +/* + * check_index_only + * Determine whether an index-only scan is possible for this index. 
+ */ +static bool +check_index_only(RelOptInfo *rel, IndexOptInfo *index) +{ + bool result; + Bitmapset *attrs_used = NULL; + Bitmapset *index_attrs = NULL; + ListCell *lc; + int i; + + /* Index-only scans must be enabled, and AM must be capable of it */ + if (!enable_indexonlyscan) + return false; + if (!index->amcanreturn) + return false; + + /* + * Check that all needed attributes of the relation are available from + * the index. + * + * XXX this is overly conservative for partial indexes, since we will + * consider attributes involved in the index predicate as required even + * though the predicate won't need to be checked at runtime. (The same + * is true for attributes used only in index quals, if we are certain + * that the index is not lossy.) However, it would be quite expensive + * to determine that accurately at this point, so for now we take the + * easy way out. + */ + + /* + * Add all the attributes needed for joins or final output. Note: we must + * look at reltargetlist, not the attr_needed data, because attr_needed + * isn't computed for inheritance child rels. + */ + pull_varattnos((Node *) rel->reltargetlist, rel->relid, &attrs_used); + + /* Add all the attributes used by restriction clauses. */ + foreach(lc, rel->baserestrictinfo) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + + pull_varattnos((Node *) rinfo->clause, rel->relid, &attrs_used); + } + + /* Construct a bitmapset of columns stored in the index. */ + for (i = 0; i < index->ncolumns; i++) + { + int attno = index->indexkeys[i]; + + /* + * For the moment, we just ignore index expressions. It might be nice + * to do something with them, later. We also ignore index columns + * that are system columns (such as OID), because the virtual-tuple + * coding used by IndexStoreHeapTuple() can't deal with them. + */ + if (attno <= 0) + continue; + + index_attrs = + bms_add_member(index_attrs, + attno - FirstLowInvalidHeapAttributeNumber); + } + + /* Do we have all the necessary attributes? 
*/ + result = bms_is_subset(attrs_used, index_attrs); + + bms_free(attrs_used); + bms_free(index_attrs); + + return result; +} + + /**************************************************************************** * ---- ROUTINES TO CHECK RESTRICTIONS ---- ****************************************************************************/ diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 0dedebaf70..36ee7c5648 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -95,7 +95,7 @@ static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid); static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, - ScanDirection indexscandir); + ScanDirection indexscandir, bool indexonly); static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig); @@ -1183,7 +1183,8 @@ create_indexscan_plan(PlannerInfo *root, stripped_indexquals, fixed_indexorderbys, indexorderbys, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexonly); copy_path_costsize(&scan_plan->scan.plan, &best_path->path); /* use the indexscan-specific rows estimate, not the parent rel's */ @@ -2841,7 +2842,8 @@ make_indexscan(List *qptlist, List *indexqualorig, List *indexorderby, List *indexorderbyorig, - ScanDirection indexscandir) + ScanDirection indexscandir, + bool indexonly) { IndexScan *node = makeNode(IndexScan); Plan *plan = &node->scan.plan; @@ -2858,6 +2860,7 @@ make_indexscan(List *qptlist, node->indexorderby = indexorderby; node->indexorderbyorig = indexorderbyorig; node->indexorderdir = indexscandir; + node->indexonly = indexonly; return node; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b4982746a2..5c18b72344 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3297,7 +3297,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) /* Estimate the cost of index scan */ indexScanPath = create_index_path(root, indexInfo, NIL, NIL, NIL, - ForwardScanDirection, NULL); + ForwardScanDirection, false, NULL); return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 4a1c94affb..8ed55a3d0e 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -25,8 +25,8 @@ #include "optimizer/paths.h" #include "optimizer/tlist.h" #include "parser/parsetree.h" -#include "utils/selfuncs.h" #include "utils/lsyscache.h" +#include "utils/selfuncs.h" static List *translate_sub_tlist(List *tlist, int relid); @@ -418,6 +418,7 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel) * 'indexscandir' is ForwardScanDirection or BackwardScanDirection * for an ordered index, or NoMovementScanDirection for * an unordered index. + * 'indexonly' is true if an index-only scan is wanted. * 'outer_rel' is the outer relation if this is a join inner indexscan path. * (pathkeys and indexscandir are ignored if so.) NULL if not. 
* @@ -430,6 +431,7 @@ create_index_path(PlannerInfo *root, List *indexorderbys, List *pathkeys, ScanDirection indexscandir, + bool indexonly, RelOptInfo *outer_rel) { IndexPath *pathnode = makeNode(IndexPath); @@ -468,6 +470,7 @@ create_index_path(PlannerInfo *root, pathnode->isjoininner = (outer_rel != NULL); pathnode->indexscandir = indexscandir; + pathnode->indexonly = indexonly; if (outer_rel != NULL) { @@ -506,7 +509,8 @@ create_index_path(PlannerInfo *root, pathnode->rows = rel->rows; } - cost_index(pathnode, root, index, indexquals, indexorderbys, outer_rel); + cost_index(pathnode, root, index, indexquals, indexorderbys, + indexonly, outer_rel); return pathnode; } diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 8a3a5d85e2..742e7a880a 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -210,6 +210,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amcanorderbyop = indexRelation->rd_am->amcanorderbyop; + info->amcanreturn = indexRelation->rd_am->amcanreturn; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index 8ce7ee41a1..888f6f67a8 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -34,6 +34,12 @@ typedef struct int sublevels_up; } pull_varnos_context; +typedef struct +{ + Bitmapset *varattnos; + Index varno; +} pull_varattnos_context; + typedef struct { int var_location; @@ -70,7 +76,7 @@ typedef struct static bool pull_varnos_walker(Node *node, pull_varnos_context *context); -static bool pull_varattnos_walker(Node *node, Bitmapset **varattnos); +static bool pull_varattnos_walker(Node *node, pull_varattnos_context *context); static bool contain_var_clause_walker(Node *node, void *context); static bool contain_vars_of_level_walker(Node *node, int *sublevels_up); static bool locate_var_of_level_walker(Node *node, @@ -177,23 +183,31 @@ pull_varnos_walker(Node *node, pull_varnos_context *context) * pull_varattnos * Find all the distinct attribute numbers present in an expression tree, * and add them to the initial contents of *varattnos. - * Only Vars that reference RTE 1 of rtable level zero are considered. + * Only Vars of the given varno and rtable level zero are considered. * * Attribute numbers are offset by FirstLowInvalidHeapAttributeNumber so that * we can include system attributes (e.g., OID) in the bitmap representation. * - * Currently, this does not support subqueries nor expressions containing - * references to multiple tables; not needed since it's only applied to - * index expressions and predicates. + * Currently, this does not support unplanned subqueries; that is not needed + * for current uses. It will handle already-planned SubPlan nodes, though, + * looking into only the "testexpr" and the "args" list. (The subplan cannot + * contain any other references to Vars of the current level.) 
*/ void -pull_varattnos(Node *node, Bitmapset **varattnos) +pull_varattnos(Node *node, Index varno, Bitmapset **varattnos) { - (void) pull_varattnos_walker(node, varattnos); + pull_varattnos_context context; + + context.varattnos = *varattnos; + context.varno = varno; + + (void) pull_varattnos_walker(node, &context); + + *varattnos = context.varattnos; } static bool -pull_varattnos_walker(Node *node, Bitmapset **varattnos) +pull_varattnos_walker(Node *node, pull_varattnos_context *context) { if (node == NULL) return false; @@ -201,17 +215,18 @@ pull_varattnos_walker(Node *node, Bitmapset **varattnos) { Var *var = (Var *) node; - Assert(var->varno == 1); - *varattnos = bms_add_member(*varattnos, - var->varattno - FirstLowInvalidHeapAttributeNumber); + if (var->varno == context->varno && var->varlevelsup == 0) + context->varattnos = + bms_add_member(context->varattnos, + var->varattno - FirstLowInvalidHeapAttributeNumber); return false; } - /* Should not find a subquery or subplan */ + + /* Should not find an unplanned subquery */ Assert(!IsA(node, Query)); - Assert(!IsA(node, SubPlan)); return expression_tree_walker(node, pull_varattnos_walker, - (void *) varattnos); + (void *) context); } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 19d94b252c..976a832135 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3065,7 +3065,7 @@ set_debug_options(int debug_flag, GucContext context, GucSource source) bool set_plan_disabling_options(const char *arg, GucContext context, GucSource source) { - char *tmp = NULL; + const char *tmp = NULL; switch (arg[0]) { @@ -3075,6 +3075,9 @@ set_plan_disabling_options(const char *arg, GucContext context, GucSource source case 'i': /* indexscan */ tmp = "enable_indexscan"; break; + case 'o': /* indexonlyscan */ + tmp = "enable_indexonlyscan"; + break; case 'b': /* bitmapscan */ tmp = "enable_bitmapscan"; break; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index b22ad07ca8..9f6b12707b 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -3684,10 +3684,10 @@ RelationGetIndexAttrBitmap(Relation relation) } /* Collect all attributes used in expressions, too */ - pull_varattnos((Node *) indexInfo->ii_Expressions, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); /* Collect all attributes in the index predicate, too */ - pull_varattnos((Node *) indexInfo->ii_Predicate, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); index_close(indexDesc, AccessShareLock); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 85fdad8996..106096faee 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -681,6 +681,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-only-scan plans."), + NULL + }, + &enable_indexonlyscan, + true, + NULL, NULL, NULL + }, { {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of bitmap-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 5bb7e7117b..1d8bd3dd23 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -243,6 +243,7 @@ #enable_hashagg = on #enable_hashjoin = on 
#enable_indexscan = on +#enable_indexonlyscan = on #enable_material = on #enable_mergejoin = on #enable_nestloop = on diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 7154ae385b..dd62680dd5 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -144,6 +144,9 @@ extern void index_rescan(IndexScanDesc scan, extern void index_endscan(IndexScanDesc scan); extern void index_markpos(IndexScanDesc scan); extern void index_restrpos(IndexScanDesc scan); +extern ItemPointer index_getnext_tid(IndexScanDesc scan, + ScanDirection direction); +extern HeapTuple index_fetch_heap(IndexScanDesc scan); extern HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 57d08b9656..656aefccee 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -16,6 +16,7 @@ #include "access/genam.h" #include "access/heapam.h" +#include "access/itup.h" typedef struct HeapScanDescData @@ -66,6 +67,7 @@ typedef struct IndexScanDescData int numberOfOrderBys; /* number of ordering operators */ ScanKey keyData; /* array of index qualifier descriptors */ ScanKey orderByData; /* array of ordering op descriptors */ + bool xs_want_itup; /* caller requests index tuples */ /* signaling to index AM about killing index tuples */ bool kill_prior_tuple; /* last-returned tuple is dead */ @@ -76,6 +78,9 @@ typedef struct IndexScanDescData /* index access method's private state */ void *opaque; /* access-method-specific info */ + /* in an index-only scan, this is valid after a successful amgettuple */ + IndexTuple xs_itup; /* index tuple returned by AM, or NULL */ + /* xs_ctup/xs_cbuf/xs_recheck are valid after a successful index_getnext */ HeapTupleData xs_ctup; /* current heap tuple, if any */ Buffer xs_cbuf; /* current heap buffer in scan, if any */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index f3c8bb4142..e4eb7b1294 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201109071 +#define CATALOG_VERSION_NO 201110071 #endif diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h index feab4201de..c3c864f95f 100644 --- a/src/include/catalog/pg_am.h +++ b/src/include/catalog/pg_am.h @@ -45,6 +45,7 @@ CATALOG(pg_am,2601) bool amcanbackward; /* does AM support backward scan? */ bool amcanunique; /* does AM support UNIQUE indexes? */ bool amcanmulticol; /* does AM support multi-column indexes? */ + bool amcanreturn; /* can AM return IndexTuples? */ bool amoptionalkey; /* can query omit key for the first column? */ bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amstorage; /* can storage type differ from column type? 
*/ @@ -78,7 +79,7 @@ typedef FormData_pg_am *Form_pg_am; * compiler constants for pg_am * ---------------- */ -#define Natts_pg_am 28 +#define Natts_pg_am 29 #define Anum_pg_am_amname 1 #define Anum_pg_am_amstrategies 2 #define Anum_pg_am_amsupport 3 @@ -87,42 +88,43 @@ typedef FormData_pg_am *Form_pg_am; #define Anum_pg_am_amcanbackward 6 #define Anum_pg_am_amcanunique 7 #define Anum_pg_am_amcanmulticol 8 -#define Anum_pg_am_amoptionalkey 9 -#define Anum_pg_am_amsearchnulls 10 -#define Anum_pg_am_amstorage 11 -#define Anum_pg_am_amclusterable 12 -#define Anum_pg_am_ampredlocks 13 -#define Anum_pg_am_amkeytype 14 -#define Anum_pg_am_aminsert 15 -#define Anum_pg_am_ambeginscan 16 -#define Anum_pg_am_amgettuple 17 -#define Anum_pg_am_amgetbitmap 18 -#define Anum_pg_am_amrescan 19 -#define Anum_pg_am_amendscan 20 -#define Anum_pg_am_ammarkpos 21 -#define Anum_pg_am_amrestrpos 22 -#define Anum_pg_am_ambuild 23 -#define Anum_pg_am_ambuildempty 24 -#define Anum_pg_am_ambulkdelete 25 -#define Anum_pg_am_amvacuumcleanup 26 -#define Anum_pg_am_amcostestimate 27 -#define Anum_pg_am_amoptions 28 +#define Anum_pg_am_amcanreturn 9 +#define Anum_pg_am_amoptionalkey 10 +#define Anum_pg_am_amsearchnulls 11 +#define Anum_pg_am_amstorage 12 +#define Anum_pg_am_amclusterable 13 +#define Anum_pg_am_ampredlocks 14 +#define Anum_pg_am_amkeytype 15 +#define Anum_pg_am_aminsert 16 +#define Anum_pg_am_ambeginscan 17 +#define Anum_pg_am_amgettuple 18 +#define Anum_pg_am_amgetbitmap 19 +#define Anum_pg_am_amrescan 20 +#define Anum_pg_am_amendscan 21 +#define Anum_pg_am_ammarkpos 22 +#define Anum_pg_am_amrestrpos 23 +#define Anum_pg_am_ambuild 24 +#define Anum_pg_am_ambuildempty 25 +#define Anum_pg_am_ambulkdelete 26 +#define Anum_pg_am_amvacuumcleanup 27 +#define Anum_pg_am_amcostestimate 28 +#define Anum_pg_am_amoptions 29 /* ---------------- * initial contents of pg_am * ---------------- */ -DATA(insert OID = 403 ( btree 5 1 t f t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions )); +DATA(insert OID = 403 ( btree 5 1 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcostestimate btoptions )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 -DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); +DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup hashcostestimate hashoptions )); DESCR("hash index access method"); #define HASH_AM_OID 405 -DATA(insert OID = 783 ( gist 0 8 f t f f t t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); +DATA(insert OID = 783 ( gist 0 8 f t f f t f t t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup gistcostestimate gistoptions )); DESCR("GiST index access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 5 f f f 
f t t f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 5 f f f f t f t f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup gincostestimate ginoptions )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index c8a0b59864..3885fa0099 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1226,6 +1226,7 @@ typedef struct * RuntimeContext expr context for evaling runtime Skeys * RelationDesc index relation descriptor * ScanDesc index scan descriptor + * VMBuffer buffer in use for visibility map testing, if any * ---------------- */ typedef struct IndexScanState @@ -1242,6 +1243,7 @@ typedef struct IndexScanState ExprContext *iss_RuntimeContext; Relation iss_RelationDesc; IndexScanDesc iss_ScanDesc; + Buffer iss_VMBuffer; } IndexScanState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 535eca77a7..60467f5276 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -300,6 +300,10 @@ typedef Scan SeqScan; * that the sort ordering is fully determinable from the top-level operators. * indexorderbyorig is unused at run time, but is needed for EXPLAIN. * (Note these fields are used for amcanorderbyop cases, not amcanorder cases.) + * + * indexorderdir specifies the scan ordering, for indexscans on amcanorder + * indexes (for other indexes it should be "don't care"). indexonly specifies + * an index-only scan, for indexscans on amcanreturn indexes. * ---------------- */ typedef struct IndexScan @@ -311,6 +315,7 @@ typedef struct IndexScan List *indexorderby; /* list of index ORDER BY exprs */ List *indexorderbyorig; /* the same in original form */ ScanDirection indexorderdir; /* forward or backward or don't care */ + bool indexonly; /* attempt to skip heap fetches? */ } IndexScan; /* ---------------- diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index ecbbc1cd39..cf48ba433c 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -482,6 +482,7 @@ typedef struct IndexOptInfo bool unique; /* true if a unique index */ bool hypothetical; /* true if index doesn't really exist */ bool amcanorderbyop; /* does AM support order by operator result? */ + bool amcanreturn; /* does AM know how to return tuples? */ bool amoptionalkey; /* can query omit key for the first column? */ bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */ bool amhasgettuple; /* does AM have amgettuple interface? */ @@ -672,6 +673,10 @@ typedef struct Path * NoMovementScanDirection for an indexscan, but the planner wants to * distinguish ordered from unordered indexes for building pathkeys.) * + * 'indexonly' is TRUE for an index-only scan, that is, the index's access + * method has amcanreturn = TRUE and we only need columns available from the + * index. + * * 'indextotalcost' and 'indexselectivity' are saved in the IndexPath so that * we need not recompute them when considering using the same index in a * bitmap index/heap scan (see BitmapHeapPath). 
The costs of the IndexPath @@ -693,6 +698,7 @@ typedef struct IndexPath List *indexorderbys; bool isjoininner; ScanDirection indexscandir; + bool indexonly; Cost indextotalcost; Selectivity indexselectivity; double rows; /* estimated number of result tuples */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 604df335d2..125808ae98 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -52,6 +52,7 @@ extern PGDLLIMPORT int effective_cache_size; extern Cost disable_cost; extern bool enable_seqscan; extern bool enable_indexscan; +extern bool enable_indexonlyscan; extern bool enable_bitmapscan; extern bool enable_tidscan; extern bool enable_sort; @@ -67,7 +68,8 @@ extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, double index_pages, PlannerInfo *root); extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_index(IndexPath *path, PlannerInfo *root, IndexOptInfo *index, - List *indexQuals, List *indexOrderBys, RelOptInfo *outer_rel); + List *indexQuals, List *indexOrderBys, + bool indexonly, RelOptInfo *outer_rel); extern void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, RelOptInfo *outer_rel); extern void cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root); diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index ee02732fe1..38c8c1c9a3 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -34,6 +34,7 @@ extern IndexPath *create_index_path(PlannerInfo *root, List *indexorderbys, List *pathkeys, ScanDirection indexscandir, + bool indexonly, RelOptInfo *outer_rel); extern BitmapHeapPath *create_bitmap_heap_path(PlannerInfo *root, RelOptInfo *rel, diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 5d7e2d93e9..4fd0052f2d 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -31,7 +31,7 @@ typedef enum } PVCPlaceHolderBehavior; extern Relids pull_varnos(Node *node); -extern void pull_varattnos(Node *node, Bitmapset **varattnos); +extern void pull_varattnos(Node *node, Index varno, Bitmapset **varattnos); extern bool contain_var_clause(Node *node); extern bool contain_vars_of_level(Node *node, int levelsup); extern int locate_var_of_level(Node *node, int levelsup); diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 69926f7b79..2324c7cda8 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -449,12 +449,12 @@ analyze tenk1; -- ensure we get consistent plans here -- Basic cases explain (costs off) select min(unique1) from tenk1; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Result InitPlan 1 (returns $0) -> Limit - -> Index Scan using tenk1_unique1 on tenk1 + -> Index Only Scan using tenk1_unique1 on tenk1 Index Cond: (unique1 IS NOT NULL) (5 rows) @@ -466,12 +466,12 @@ select min(unique1) from tenk1; explain (costs off) select max(unique1) from tenk1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Result InitPlan 1 (returns $0) -> Limit - -> Index Scan Backward using tenk1_unique1 on tenk1 + -> Index Only Scan Backward using tenk1_unique1 on tenk1 Index Cond: (unique1 IS NOT NULL) (5 rows) @@ -488,7 
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 69926f7b79..2324c7cda8 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -449,12 +449,12 @@ analyze tenk1;		-- ensure we get consistent plans here
 -- Basic cases
 explain (costs off)
   select min(unique1) from tenk1;
-                      QUERY PLAN                       
--------------------------------------------------------
+                        QUERY PLAN                          
+------------------------------------------------------------
 Result
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan using tenk1_unique1 on tenk1
+          ->  Index Only Scan using tenk1_unique1 on tenk1
               Index Cond: (unique1 IS NOT NULL)
 (5 rows)
@@ -466,12 +466,12 @@ select min(unique1) from tenk1;
 explain (costs off)
   select max(unique1) from tenk1;
-                          QUERY PLAN                           
----------------------------------------------------------------
+                            QUERY PLAN                              
+---------------------------------------------------------------------
 Result
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique1 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique1 on tenk1
               Index Cond: (unique1 IS NOT NULL)
 (5 rows)
@@ -488,7 +488,7 @@ explain (costs off)
 Result
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique1 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique1 on tenk1
               Index Cond: ((unique1 IS NOT NULL) AND (unique1 < 42))
 (5 rows)
@@ -505,7 +505,7 @@ explain (costs off)
 Result
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique1 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique1 on tenk1
               Index Cond: ((unique1 IS NOT NULL) AND (unique1 > 42))
 (5 rows)
@@ -522,7 +522,7 @@ explain (costs off)
 Result
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique1 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique1 on tenk1
               Index Cond: ((unique1 IS NOT NULL) AND (unique1 > 42000))
 (5 rows)
@@ -535,12 +535,12 @@ select max(unique1) from tenk1 where unique1 > 42000;
 -- multi-column index (uses tenk1_thous_tenthous)
 explain (costs off)
   select max(tenthous) from tenk1 where thousand = 33;
-                               QUERY PLAN                                 
--------------------------------------------------------------------------
+                                 QUERY PLAN                                   
+-----------------------------------------------------------------------------
 Result
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_thous_tenthous on tenk1
+          ->  Index Only Scan Backward using tenk1_thous_tenthous on tenk1
               Index Cond: ((thousand = 33) AND (tenthous IS NOT NULL))
 (5 rows)
@@ -557,7 +557,7 @@ explain (costs off)
 Result
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan using tenk1_thous_tenthous on tenk1
+          ->  Index Only Scan using tenk1_thous_tenthous on tenk1
               Index Cond: ((thousand = 33) AND (tenthous IS NOT NULL))
 (5 rows)
@@ -578,7 +578,7 @@ explain (costs off)
         ->  Result
               InitPlan 1 (returns $1)
                 ->  Limit
-                      ->  Index Scan using tenk1_unique1 on tenk1
+                      ->  Index Only Scan using tenk1_unique1 on tenk1
                             Index Cond: ((unique1 IS NOT NULL) AND (unique1 > int4_tbl.f1))
 (7 rows)
@@ -596,12 +596,12 @@ select f1, (select min(unique1) from tenk1 where unique1 > f1) AS gt
 -- check some cases that were handled incorrectly in 8.3.0
 explain (costs off)
   select distinct max(unique2) from tenk1;
-                          QUERY PLAN                           
----------------------------------------------------------------
+                            QUERY PLAN                              
+---------------------------------------------------------------------
 HashAggregate
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique2 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique2 on tenk1
               Index Cond: (unique2 IS NOT NULL)
   ->  Result
 (6 rows)
@@ -614,13 +614,13 @@ select distinct max(unique2) from tenk1;
 explain (costs off)
   select max(unique2) from tenk1 order by 1;
-                          QUERY PLAN                           
----------------------------------------------------------------
+                            QUERY PLAN                              
+---------------------------------------------------------------------
 Sort
   Sort Key: ($0)
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique2 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique2 on tenk1
               Index Cond: (unique2 IS NOT NULL)
   ->  Result
 (7 rows)
@@ -633,13 +633,13 @@ select max(unique2) from tenk1 order by 1;
 explain (costs off)
   select max(unique2) from tenk1 order by max(unique2);
-                          QUERY PLAN                           
----------------------------------------------------------------
+                            QUERY PLAN                              
+---------------------------------------------------------------------
 Sort
   Sort Key: ($0)
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique2 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique2 on tenk1
               Index Cond: (unique2 IS NOT NULL)
   ->  Result
 (7 rows)
@@ -652,13 +652,13 @@ select max(unique2) from tenk1 order by max(unique2);
 explain (costs off)
   select max(unique2) from tenk1 order by max(unique2)+1;
-                          QUERY PLAN                           
----------------------------------------------------------------
+                            QUERY PLAN                              
+---------------------------------------------------------------------
 Sort
   Sort Key: (($0 + 1))
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique2 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique2 on tenk1
               Index Cond: (unique2 IS NOT NULL)
   ->  Result
 (7 rows)
@@ -671,13 +671,13 @@ select max(unique2) from tenk1 order by max(unique2)+1;
 explain (costs off)
   select max(unique2), generate_series(1,3) as g from tenk1 order by g desc;
-                          QUERY PLAN                           
----------------------------------------------------------------
+                            QUERY PLAN                              
+---------------------------------------------------------------------
 Sort
   Sort Key: (generate_series(1, 3))
   InitPlan 1 (returns $0)
     ->  Limit
-          ->  Index Scan Backward using tenk1_unique2 on tenk1
+          ->  Index Only Scan Backward using tenk1_unique2 on tenk1
               Index Cond: (unique2 IS NOT NULL)
   ->  Result
 (7 rows)
@@ -705,32 +705,32 @@ insert into minmaxtest2 values(15), (16);
 insert into minmaxtest3 values(17), (18);
 explain (costs off)
   select min(f1), max(f1) from minmaxtest;
-                                    QUERY PLAN                                     
------------------------------------------------------------------------------------
+                                       QUERY PLAN                                        
+------------------------------------------------------------------------------------------
 Result
   InitPlan 1 (returns $0)
     ->  Limit
           ->  Merge Append
                 Sort Key: public.minmaxtest.f1
-                ->  Index Scan using minmaxtesti on minmaxtest
+                ->  Index Only Scan using minmaxtesti on minmaxtest
                       Index Cond: (f1 IS NOT NULL)
-                ->  Index Scan using minmaxtest1i on minmaxtest1 minmaxtest
+                ->  Index Only Scan using minmaxtest1i on minmaxtest1 minmaxtest
                       Index Cond: (f1 IS NOT NULL)
-                ->  Index Scan Backward using minmaxtest2i on minmaxtest2 minmaxtest
+                ->  Index Only Scan Backward using minmaxtest2i on minmaxtest2 minmaxtest
                       Index Cond: (f1 IS NOT NULL)
-                ->  Index Scan using minmaxtest3i on minmaxtest3 minmaxtest
+                ->  Index Only Scan using minmaxtest3i on minmaxtest3 minmaxtest
                       Index Cond: (f1 IS NOT NULL)
   InitPlan 2 (returns $1)
     ->  Limit
           ->  Merge Append
                 Sort Key: public.minmaxtest.f1
-                ->  Index Scan Backward using minmaxtesti on minmaxtest
+                ->  Index Only Scan Backward using minmaxtesti on minmaxtest
                       Index Cond: (f1 IS NOT NULL)
-                ->  Index Scan Backward using minmaxtest1i on minmaxtest1 minmaxtest
+                ->  Index Only Scan Backward using minmaxtest1i on minmaxtest1 minmaxtest
                       Index Cond: (f1 IS NOT NULL)
-                ->  Index Scan using minmaxtest2i on minmaxtest2 minmaxtest
+                ->  Index Only Scan using minmaxtest2i on minmaxtest2 minmaxtest
                       Index Cond: (f1 IS NOT NULL)
-                ->  Index Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest
+                ->  Index Only Scan Backward using minmaxtest3i on minmaxtest3 minmaxtest
                       Index Cond: (f1 IS NOT NULL)
 (25 rows)
diff --git a/src/test/regress/expected/rangefuncs.out b/src/test/regress/expected/rangefuncs.out
index 51d561b28f..5f20c93249 100644
--- a/src/test/regress/expected/rangefuncs.out
+++ b/src/test/regress/expected/rangefuncs.out
@@ -1,17 +1,18 @@
 SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%';
-       name        | setting 
--------------------+---------
- enable_bitmapscan | on
- enable_hashagg    | on
- enable_hashjoin   | on
- enable_indexscan  | on
- enable_material   | on
- enable_mergejoin  | on
- enable_nestloop   | on
- enable_seqscan    | on
- enable_sort       | on
- enable_tidscan    | on
-(10 rows)
+         name         | setting 
+----------------------+---------
+ enable_bitmapscan    | on
+ enable_hashagg       | on
+ enable_hashjoin      | on
+ enable_indexonlyscan | on
+ enable_indexscan     | on
+ enable_material      | on
+ enable_mergejoin     | on
+ enable_nestloop      | on
+ enable_seqscan       | on
+ enable_sort          | on
+ enable_tidscan       | on
+(11 rows)
 
 CREATE TABLE foo2(fooid int, f2 int);
 INSERT INTO foo2 VALUES(1, 11);