/*-------------------------------------------------------------------------
 *
 * nodeIndexonlyscan.c
 *	  Routines to support index-only scans
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeIndexonlyscan.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * INTERFACE ROUTINES
 *		ExecIndexOnlyScan			scans an index
 *		IndexOnlyNext				retrieves the next tuple
 *		ExecInitIndexOnlyScan		creates and initializes state info.
 *		ExecReScanIndexOnlyScan		rescans the indexed relation.
 *		ExecEndIndexOnlyScan		releases all storage.
 *		ExecIndexOnlyMarkPos		marks scan position.
 *		ExecIndexOnlyRestrPos		restores scan position.
 *		ExecIndexOnlyScanEstimate	estimates DSM space needed for
 *									parallel index-only scan
 *		ExecIndexOnlyScanInitializeDSM	initializes DSM for a parallel
 *										index-only scan
 *		ExecIndexOnlyScanReInitializeDSM	reinitializes DSM for a fresh scan
 *		ExecIndexOnlyScanInitializeWorker	attaches to DSM info in a parallel worker
 */

#include "postgres.h"

#include "access/relscan.h"
#include "access/visibilitymap.h"
#include "executor/execdebug.h"
#include "executor/nodeIndexonlyscan.h"
#include "executor/nodeIndexscan.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/memutils.h"
#include "utils/rel.h"


static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
							TupleDesc itupdesc);


/* ----------------------------------------------------------------
 *		IndexOnlyNext
 *
 *		Retrieve a tuple from the IndexOnlyScan node's index.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState *node)
{
	EState	   *estate;
	ExprContext *econtext;
	ScanDirection direction;
	IndexScanDesc scandesc;
	TupleTableSlot *slot;
	ItemPointer tid;

	/*
	 * extract necessary information from index scan node
	 */
	estate = node->ss.ps.state;
	direction = estate->es_direction;
	/* flip direction if this is an overall backward scan */
	if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
	{
		if (ScanDirectionIsForward(direction))
			direction = BackwardScanDirection;
		else if (ScanDirectionIsBackward(direction))
			direction = ForwardScanDirection;
	}
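
	/*
	 * For example: a plan built with indexorderdir = BackwardScanDirection
	 * (say, to satisfy an ORDER BY ... DESC) that is then driven backward
	 * by a cursor's FETCH BACKWARD ends up reading the index in
	 * ForwardScanDirection; the two reversals cancel out.
	 */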

	scandesc = node->ioss_ScanDesc;
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;

	if (scandesc == NULL)
	{
		/*
		 * We reach here if the index only scan is not parallel, or if we're
		 * serially executing an index only scan that was planned to be
		 * parallel.
		 */
		scandesc = index_beginscan(node->ss.ss_currentRelation,
								   node->ioss_RelationDesc,
								   estate->es_snapshot,
								   node->ioss_NumScanKeys,
								   node->ioss_NumOrderByKeys);

		node->ioss_ScanDesc = scandesc;


		/* Set it up for index-only scan */
		node->ioss_ScanDesc->xs_want_itup = true;
		node->ioss_VMBuffer = InvalidBuffer;
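
		/*
		 * (Note: xs_want_itup asks the index AM to return each matching
		 * index tuple along with its TID; ioss_VMBuffer caches a pin on the
		 * visibility-map page consulted below, and is released in
		 * ExecEndIndexOnlyScan.)
		 */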

		/*
		 * If no run-time keys to calculate or they are ready, go ahead and
		 * pass the scankeys to the index AM.
		 */
		if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
			index_rescan(scandesc,
						 node->ioss_ScanKeys,
						 node->ioss_NumScanKeys,
						 node->ioss_OrderByKeys,
						 node->ioss_NumOrderByKeys);
	}

	/*
	 * OK, now that we have what we need, fetch the next tuple.
	 */
	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
	{
		HeapTuple	tuple = NULL;

		CHECK_FOR_INTERRUPTS();

		/*
		 * We can skip the heap fetch if the TID references a heap page on
		 * which all tuples are known visible to everybody.  In any case,
		 * we'll use the index tuple not the heap tuple as the data source.
		 *
		 * Note on Memory Ordering Effects: visibilitymap_get_status does not
		 * lock the visibility map buffer, and therefore the result we read
		 * here could be slightly stale.  However, it can't be stale enough
		 * to matter.
		 *
		 * We need to detect clearing a VM bit due to an insert right away,
		 * because the tuple is present in the index page but not visible.
		 * The reading of the TID by this scan (using a shared lock on the
		 * index buffer) is serialized with the insert of the TID into the
		 * index (using an exclusive lock on the index buffer).  Because the
		 * VM bit is cleared before updating the index, and locking/unlocking
		 * of the index page acts as a full memory barrier, we are sure to
		 * see the cleared bit if we see a recently-inserted TID.
		 *
		 * Deletes do not update the index page (only VACUUM will clear out
		 * the TID), so the clearing of the VM bit by a delete is not
		 * serialized with this test below, and we may see a value that is
		 * significantly stale.  However, we don't care about the delete
		 * right away, because the tuple is still visible until the deleting
		 * transaction commits or the statement ends (if it's our
		 * transaction).  In either case, the lock on the VM buffer will have
		 * been released (acting as a write barrier) after clearing the bit.
		 * And for us to have a snapshot that includes the deleting
		 * transaction (making the tuple invisible), we must have acquired
		 * ProcArrayLock after that time, acting as a read barrier.
		 *
		 * It's worth going through this complexity to avoid needing to lock
		 * the VM buffer, which could cause significant contention.
		 */
		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
							ItemPointerGetBlockNumber(tid),
							&node->ioss_VMBuffer))
		{
			/*
			 * Rats, we have to visit the heap to check visibility.
			 */
			InstrCountTuples2(node, 1);
			tuple = index_fetch_heap(scandesc);
			if (tuple == NULL)
				continue;		/* no visible tuple, try next index entry */

			/*
			 * Only MVCC snapshots are supported here, so there should be no
			 * need to keep following the HOT chain once a visible entry has
			 * been found.  If we did want to allow that, we'd need to keep
			 * more state to remember not to call index_getnext_tid next
			 * time.
			 */
			if (scandesc->xs_continue_hot)
				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");

			/*
			 * Note: at this point we are holding a pin on the heap page, as
			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
			 * but it's not clear whether it's a win to do so.  The next
			 * index entry might require a visit to the same heap page.
			 */
		}

		/*
		 * Fill the scan tuple slot with data from the index.  This might be
		 * provided in either HeapTuple or IndexTuple format.  Conceivably an
		 * index AM might fill both fields, in which case we prefer the heap
		 * format, since it's probably a bit cheaper to fill a slot from.
		 */
		if (scandesc->xs_hitup)
		{
			/*
			 * We don't take the trouble to verify that the provided tuple
			 * has exactly the slot's format, but it seems worth doing a
			 * quick check on the number of fields.
			 */
			Assert(slot->tts_tupleDescriptor->natts ==
				   scandesc->xs_hitupdesc->natts);
			ExecStoreHeapTuple(scandesc->xs_hitup, slot, false);
		}
		else if (scandesc->xs_itup)
			StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
		else
			elog(ERROR, "no data returned for index-only scan");

		/*
		 * If the index was lossy, we have to recheck the index quals.
		 * (Currently, this can never happen, but we should support the case
		 * for possible future use, eg with GiST indexes.)
		 */
		if (scandesc->xs_recheck)
		{
			econtext->ecxt_scantuple = slot;
			if (!ExecQualAndReset(node->indexqual, econtext))
			{
				/* Fails recheck, so drop it and loop back for another */
				InstrCountFiltered2(node, 1);
				continue;
			}
		}

		/*
		 * We don't currently support rechecking ORDER BY distances.  (In
		 * principle, if the index can support retrieval of the originally
		 * indexed value, it should be able to produce an exact distance
		 * calculation too.  So it's not clear that adding code here for
		 * recheck/re-sort would be worth the trouble.  But we should at
		 * least throw an error if someone tries it.)
		 */
		if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("lossy distance functions are not supported in index-only scans")));

		/*
		 * Predicate locks for index-only scans must be acquired at the page
		 * level when the heap is not accessed, since tuple-level predicate
		 * locks need the tuple's xmin value.  If we had to visit the tuple
		 * anyway, then we already have the tuple-level lock and can skip the
		 * page lock.
		 */
		if (tuple == NULL)
			PredicateLockPage(scandesc->heapRelation,
							  ItemPointerGetBlockNumber(tid),
							  estate->es_snapshot);
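
		/*
		 * (For example, under SERIALIZABLE isolation the page-level SIRead
		 * lock taken above means a later UPDATE or DELETE of any tuple on
		 * that heap page can be detected as a read-write conflict, even
		 * though this scan never fetched the heap tuple itself.)
		 */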

		return slot;
	}

	/*
	 * if we get here it means the index scan found no more matching tuples,
	 * so we are at the end of the scan.
	 */
	return ExecClearTuple(slot);
}

/*
 * StoreIndexTuple
 *		Fill the slot with data from the index tuple.
 *
 * At some point this might be generally-useful functionality, but
 * right now we don't need it elsewhere.
 */
static void
StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc)
{
	int			nindexatts = itupdesc->natts;
	Datum	   *values = slot->tts_values;
	bool	   *isnull = slot->tts_isnull;
	int			i;

	/*
	 * Note: we must use the tupdesc supplied by the AM in index_getattr, not
	 * the slot's tupdesc, in case the latter has different datatypes (this
	 * happens for btree name_ops in particular).  They'd better have the
	 * same number of columns though, as well as being datatype-compatible,
	 * which is something we can't so easily check.
	 */
	Assert(slot->tts_tupleDescriptor->natts == nindexatts);

	ExecClearTuple(slot);
	for (i = 0; i < nindexatts; i++)
		values[i] = index_getattr(itup, i + 1, itupdesc, &isnull[i]);
	ExecStoreVirtualTuple(slot);
}

/*
 * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
 *
 * This can't really happen, since an index can't supply CTID which would
 * be necessary data for any potential EvalPlanQual target relation.  If it
 * did happen, the EPQ code would pass us the wrong data, namely a heap
 * tuple not an index tuple.  So throw an error.
 */
static bool
IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot)
{
	elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
	return false;				/* keep compiler quiet */
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScan(node)
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ExecIndexOnlyScan(PlanState *pstate)
{
	IndexOnlyScanState *node = castNode(IndexOnlyScanState, pstate);

	/*
	 * If we have runtime keys and they've not already been set up, do it now.
	 */
	if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
		ExecReScan((PlanState *) node);

	return ExecScan(&node->ss,
					(ExecScanAccessMtd) IndexOnlyNext,
					(ExecScanRecheckMtd) IndexOnlyRecheck);
}

/* ----------------------------------------------------------------
 *		ExecReScanIndexOnlyScan(node)
 *
 *		Recalculates the values of any scan keys whose value depends on
 *		information known at runtime, then rescans the indexed relation.
 *
 *		Updating the scan key was formerly done separately in
 *		ExecUpdateIndexScanKeys.  Integrating it into ReScan makes
 *		rescans of indices and relations/general streams more uniform.
 * ----------------------------------------------------------------
 */
void
ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
{
	/*
	 * If we are doing runtime key calculations (ie, any of the index key
	 * values weren't simple Consts), compute the new key values.  But first,
	 * reset the context so we don't leak memory as each outer tuple is
	 * scanned.  Note this assumes that we will recalculate *all* runtime
	 * keys on each call.
	 */
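	/*
	 * For example: when this node is the inner side of a nestloop join with
	 * an index qual like "indexed_col = outer.x", the key value depends on
	 * the current outer tuple; each new outer tuple triggers a rescan of
	 * this node, and the key is recomputed here before the index is
	 * rescanned.
	 */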
	if (node->ioss_NumRuntimeKeys != 0)
	{
		ExprContext *econtext = node->ioss_RuntimeContext;

		ResetExprContext(econtext);
		ExecIndexEvalRuntimeKeys(econtext,
								 node->ioss_RuntimeKeys,
								 node->ioss_NumRuntimeKeys);
	}
	node->ioss_RuntimeKeysReady = true;

	/* reset index scan */
	if (node->ioss_ScanDesc)
		index_rescan(node->ioss_ScanDesc,
					 node->ioss_ScanKeys, node->ioss_NumScanKeys,
					 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);

	ExecScanReScan(&node->ss);
}


/* ----------------------------------------------------------------
 *		ExecEndIndexOnlyScan
 * ----------------------------------------------------------------
 */
void
ExecEndIndexOnlyScan(IndexOnlyScanState *node)
{
	Relation	indexRelationDesc;
	IndexScanDesc indexScanDesc;

	/*
	 * extract information from the node
	 */
	indexRelationDesc = node->ioss_RelationDesc;
	indexScanDesc = node->ioss_ScanDesc;

	/* Release VM buffer pin, if any. */
	if (node->ioss_VMBuffer != InvalidBuffer)
	{
		ReleaseBuffer(node->ioss_VMBuffer);
		node->ioss_VMBuffer = InvalidBuffer;
	}

	/*
	 * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext
	 */
#ifdef NOT_USED
	ExecFreeExprContext(&node->ss.ps);
	if (node->ioss_RuntimeContext)
		FreeExprContext(node->ioss_RuntimeContext, true);
#endif

	/*
	 * clear out tuple table slots
	 */
	if (node->ss.ps.ps_ResultTupleSlot)
		ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
	ExecClearTuple(node->ss.ss_ScanTupleSlot);

	/*
	 * close the index relation (no-op if we didn't open it)
	 */
	if (indexScanDesc)
		index_endscan(indexScanDesc);
	if (indexRelationDesc)
		index_close(indexRelationDesc, NoLock);
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyMarkPos
 *
 * Note: we assume that no caller attempts to set a mark before having read
 * at least one tuple.  Otherwise, ioss_ScanDesc might still be NULL.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyMarkPos(IndexOnlyScanState *node)
{
	EState	   *estate = node->ss.ps.state;

	if (estate->es_epqTuple != NULL)
	{
		/*
		 * We are inside an EvalPlanQual recheck.  If a test tuple exists for
		 * this relation, then we shouldn't access the index at all.  We
		 * would instead need to save, and later restore, the state of the
		 * es_epqScanDone flag, so that re-fetching the test tuple is
		 * possible.  However, given the assumption that no caller sets a
		 * mark at the start of the scan, we can only get here with
		 * es_epqScanDone already set, and so no state need be saved.
		 */
		Index		scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;

		Assert(scanrelid > 0);
		if (estate->es_epqTupleSet[scanrelid - 1])
		{
			/* Verify the claim above */
			if (!estate->es_epqScanDone[scanrelid - 1])
				elog(ERROR, "unexpected ExecIndexOnlyMarkPos call in EPQ recheck");
			return;
		}
	}

	index_markpos(node->ioss_ScanDesc);
}

/* ----------------------------------------------------------------
 *		ExecIndexOnlyRestrPos
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyRestrPos(IndexOnlyScanState *node)
{
	EState	   *estate = node->ss.ps.state;

	if (estate->es_epqTuple != NULL)
	{
		/* See comments in ExecIndexOnlyMarkPos */
		Index		scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;

		Assert(scanrelid > 0);
		if (estate->es_epqTupleSet[scanrelid - 1])
		{
			/* Verify the claim above */
			if (!estate->es_epqScanDone[scanrelid - 1])
				elog(ERROR, "unexpected ExecIndexOnlyRestrPos call in EPQ recheck");
			return;
		}
	}

	index_restrpos(node->ioss_ScanDesc);
}

/* ----------------------------------------------------------------
 *		ExecInitIndexOnlyScan
 *
 *		Initializes the index scan's state information, creates
 *		scan keys, and opens the base and index relations.
 *
 *		Note: index scans have 2 sets of state information because
 *		we have to keep track of the base relation and the
 *		index relation.
 * ----------------------------------------------------------------
 */
IndexOnlyScanState *
ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
{
	IndexOnlyScanState *indexstate;
	Relation	currentRelation;
	bool		relistarget;
	TupleDesc	tupDesc;

	/*
	 * create state structure
	 */
	indexstate = makeNode(IndexOnlyScanState);
	indexstate->ss.ps.plan = (Plan *) node;
	indexstate->ss.ps.state = estate;
	indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &indexstate->ss.ps);

	/*
	 * open the scan relation
	 */
	currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);

	indexstate->ss.ss_currentRelation = currentRelation;
	indexstate->ss.ss_currentScanDesc = NULL;	/* no heap scan here */

	/*
	 * Build the scan tuple type using the indextlist generated by the
	 * planner.  We use this, rather than the index's physical tuple
	 * descriptor, because the latter contains storage column types not the
	 * types of the original datums.  (It's the AM's responsibility to return
	 * suitable data anyway.)
	 */
	tupDesc = ExecTypeFromTL(node->indextlist);
	ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc, &TTSOpsHeapTuple);
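
	/*
	 * (For instance, an index whose opclass uses a storage type different
	 * from the indexed column's type -- btree name_ops is the example cited
	 * in StoreIndexTuple above -- still yields a scan tuple descriptor here
	 * with the column's original type, as dictated by the planner's
	 * indextlist.)
	 */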

	/*
	 * Initialize result type and projection info.  The node's targetlist
	 * will contain Vars with varno = INDEX_VAR, referencing the scan tuple.
	 */
	ExecInitResultTypeTL(&indexstate->ss.ps);
	ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR);

	/*
	 * initialize child expressions
	 *
	 * Note: we don't initialize all of the indexorderby expression, only the
	 * sub-parts corresponding to runtime keys (see below).
	 */
	indexstate->ss.ps.qual =
		ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate);
	indexstate->indexqual =
		ExecInitQual(node->indexqual, (PlanState *) indexstate);

	/*
	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
	 * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
	 * references to nonexistent indexes.
	 */
	if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
		return indexstate;

	/*
	 * Open the index relation.
	 *
	 * If the parent table is one of the target relations of the query, then
	 * InitPlan already opened and write-locked the index, so we can avoid
	 * taking another lock here.  Otherwise we need a normal reader's lock.
	 */
	relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid);
	indexstate->ioss_RelationDesc = index_open(node->indexid,
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 21:35:54 +02:00
2011-10-11 20:20:06 +02:00

	/*
	 * Initialize index-specific scan state
	 */
	indexstate->ioss_RuntimeKeysReady = false;
	indexstate->ioss_RuntimeKeys = NULL;
	indexstate->ioss_NumRuntimeKeys = 0;

	/*
	 * build the index scan keys from the index qualification
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->ioss_RelationDesc,
						   node->indexqual,
						   false,
						   &indexstate->ioss_ScanKeys,
						   &indexstate->ioss_NumScanKeys,
						   &indexstate->ioss_RuntimeKeys,
						   &indexstate->ioss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);
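
	/*
	 * Roughly speaking, each indexqual clause of the form "indexed_col OP
	 * constant" becomes one ScanKey: sk_attno is the column's position in
	 * the index, sk_strategy is the operator's strategy number in the
	 * index's operator family (e.g. BTLessStrategyNumber for "<" on a btree
	 * column), and sk_argument is the comparison value.  For clauses whose
	 * comparison value is not a constant, the argument is filled in later
	 * from the ioss_RuntimeKeys entries built here.
	 */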

	/*
	 * any ORDER BY exprs have to be turned into scankeys in the same way
	 */
	ExecIndexBuildScanKeys((PlanState *) indexstate,
						   indexstate->ioss_RelationDesc,
						   node->indexorderby,
						   true,
						   &indexstate->ioss_OrderByKeys,
						   &indexstate->ioss_NumOrderByKeys,
						   &indexstate->ioss_RuntimeKeys,
						   &indexstate->ioss_NumRuntimeKeys,
						   NULL,	/* no ArrayKeys */
						   NULL);
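
	/*
	 * Order-by scan keys are used with amcanorderbyop indexes; the classic
	 * example is a GiST KNN search such as "ORDER BY point_col <-> '(0,0)'",
	 * where the index AM itself returns rows in distance order rather than
	 * the executor sorting them afterwards.
	 */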

	/*
	 * If we have runtime keys, we need an ExprContext to evaluate them. The
	 * node's standard context won't do because we want to reset that context
	 * for every tuple.  So, build another context just like the other one...
	 * -tgl 7/11/00
	 */
	if (indexstate->ioss_NumRuntimeKeys != 0)
	{
		ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;

		ExecAssignExprContext(estate, &indexstate->ss.ps);
		indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
		indexstate->ss.ps.ps_ExprContext = stdecontext;
	}
	else
	{
		indexstate->ioss_RuntimeContext = NULL;
	}
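
	/*
	 * The swap above leaves ps_ExprContext pointing at the standard per-tuple
	 * context while ioss_RuntimeContext holds the extra one.  At rescan time
	 * the runtime keys are evaluated in that extra context, roughly:
	 *
	 *		ResetExprContext(node->ioss_RuntimeContext);
	 *		ExecIndexEvalRuntimeKeys(node->ioss_RuntimeContext,
	 *								 node->ioss_RuntimeKeys,
	 *								 node->ioss_NumRuntimeKeys);
	 *
	 * (see ExecReScanIndexOnlyScan), so resetting the per-tuple context for
	 * each tuple doesn't clobber the computed key values.
	 */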

	/*
	 * all done.
	 */
	return indexstate;
}
2017-02-19 11:23:59 +01:00

/* ----------------------------------------------------------------
 *						Parallel Index-only Scan Support
 * ----------------------------------------------------------------
 */
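
/*
 * These routines follow the generic parallel-executor protocol: the leader
 * calls the Estimate and InitializeDSM routines while setting up the parallel
 * context, calls ReInitializeDSM before launching a fresh batch of workers
 * for a rescan, and each worker calls InitializeWorker to attach to the
 * shared scan descriptor (see execParallel.c for the dispatching code).
 */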

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanEstimate
 *
 *		Compute the amount of space we'll need in the parallel
 *		query DSM, and inform pcxt->estimator about our needs.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyScanEstimate(IndexOnlyScanState *node,
						  ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;

	node->ioss_PscanLen = index_parallelscan_estimate(node->ioss_RelationDesc,
													   estate->es_snapshot);
	shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}
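
/*
 * The space estimated above covers the index AM's parallel-scan struct plus
 * a serialized copy of the scan snapshot, so that workers can restore the
 * same snapshot; the resulting length in ioss_PscanLen is what
 * shm_toc_allocate hands out in ExecIndexOnlyScanInitializeDSM below.
 */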

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanInitializeDSM
 *
 *		Set up a parallel index-only scan descriptor.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
							   ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;
	ParallelIndexScanDesc piscan;

	piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen);
	index_parallelscan_initialize(node->ss.ss_currentRelation,
								  node->ioss_RelationDesc,
								  estate->es_snapshot,
								  piscan);
	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);
	node->ioss_ScanDesc =
		index_beginscan_parallel(node->ss.ss_currentRelation,
								 node->ioss_RelationDesc,
								 node->ioss_NumScanKeys,
								 node->ioss_NumOrderByKeys,
								 piscan);
	node->ioss_ScanDesc->xs_want_itup = true;
	node->ioss_VMBuffer = InvalidBuffer;

	/*
	 * If no run-time keys to calculate or they are ready, go ahead and pass
	 * the scankeys to the index AM.
	 */
	if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
		index_rescan(node->ioss_ScanDesc,
					 node->ioss_ScanKeys, node->ioss_NumScanKeys,
					 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
}
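
/*
 * Note that the leader joins the scan as well: the descriptor set up here is
 * the one its own IndexOnlyNext calls will use.  Setting xs_want_itup asks
 * the index AM to hand back the index tuple itself, which is what allows the
 * heap fetch to be skipped whenever the visibility map shows the page as
 * all-visible.
 */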
2017-08-30 19:18:16 +02:00

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanReInitializeDSM
 *
 *		Reset shared state before beginning a fresh scan.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node,
								 ParallelContext *pcxt)
{
	index_parallelrescan(node->ioss_ScanDesc);
}
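
/*
 * index_parallelrescan asks the index AM to reset its shared scan state (for
 * a btree, the shared "current page" position), so that the next pass of the
 * scan starts over from the beginning for all participants.
 */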
2017-02-19 11:23:59 +01:00

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanInitializeWorker
 *
 *		Copy relevant information from TOC into planstate.
 * ----------------------------------------------------------------
 */
void
ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
								  ParallelWorkerContext *pwcxt)
{
	ParallelIndexScanDesc piscan;

	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
	node->ioss_ScanDesc =
		index_beginscan_parallel(node->ss.ss_currentRelation,
								 node->ioss_RelationDesc,
								 node->ioss_NumScanKeys,
								 node->ioss_NumOrderByKeys,
								 piscan);
	node->ioss_ScanDesc->xs_want_itup = true;

	/*
	 * If no run-time keys to calculate or they are ready, go ahead and pass
	 * the scankeys to the index AM.
	 */
	if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
		index_rescan(node->ioss_ScanDesc,
					 node->ioss_ScanKeys, node->ioss_NumScanKeys,
					 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
}
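
/*
 * The shm_toc key used for the lookup is the plan node ID, matching the
 * shm_toc_insert call in ExecIndexOnlyScanInitializeDSM, so each worker
 * attaches to the descriptor belonging to this particular plan node.
 */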