/*-------------------------------------------------------------------------
 *
 * gistget.c
 *	  fetch tuples from a GiST scan.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gist/gistget.c
 *
 *-------------------------------------------------------------------------
 */
|
1999-07-16 01:04:24 +02:00
|
|
|
#include "postgres.h"
|
1996-10-31 09:09:47 +01:00
|
|
|
|
2019-12-27 00:09:00 +01:00
|
|
|
#include "access/genam.h"
|
2005-06-14 13:45:14 +02:00
|
|
|
#include "access/gist_private.h"
|
2008-06-19 02:46:06 +02:00
|
|
|
#include "access/relscan.h"
|
2019-11-12 04:00:16 +01:00
|
|
|
#include "lib/pairingheap.h"
|
2008-04-11 00:25:26 +02:00
|
|
|
#include "miscadmin.h"
|
2019-11-12 04:00:16 +01:00
|
|
|
#include "pgstat.h"
|
2018-03-27 14:43:19 +02:00
|
|
|
#include "storage/lmgr.h"
|
|
|
|
#include "storage/predicate.h"
|
2018-07-29 03:30:48 +02:00
|
|
|
#include "utils/float.h"
|
2005-05-17 02:59:30 +02:00
|
|
|
#include "utils/memutils.h"
|
2011-02-23 18:18:09 +01:00
|
|
|
#include "utils/rel.h"
|
1996-10-21 07:11:00 +02:00
|
|
|
|
2015-09-09 17:43:37 +02:00
|
|
|
/*
|
|
|
|
* gistkillitems() -- set LP_DEAD state for items an indexscan caller has
|
|
|
|
* told us were killed.
|
|
|
|
*
|
|
|
|
* We re-read page here, so it's important to check page LSN. If the page
|
|
|
|
* has been modified since the last read (as determined by LSN), we cannot
|
|
|
|
* flag any entries because it is possible that the old entry was vacuumed
|
|
|
|
* away and the TID was re-used by a completely different heap tuple.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
gistkillitems(IndexScanDesc scan)
|
|
|
|
{
|
|
|
|
GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
|
|
|
|
Buffer buffer;
|
|
|
|
Page page;
|
|
|
|
OffsetNumber offnum;
|
|
|
|
ItemId iid;
|
|
|
|
int i;
|
|
|
|
bool killedsomething = false;
|
|
|
|
|
|
|
|
Assert(so->curBlkno != InvalidBlockNumber);
|
|
|
|
Assert(!XLogRecPtrIsInvalid(so->curPageLSN));
|
|
|
|
Assert(so->killedItems != NULL);
|
|
|
|
|
|
|
|
buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
|
|
|
|
if (!BufferIsValid(buffer))
|
|
|
|
return;
|
|
|
|
|
|
|
|
LockBuffer(buffer, GIST_SHARE);
|
|
|
|
gistcheckpage(scan->indexRelation, buffer);
|
2016-04-20 15:31:19 +02:00
|
|
|
page = BufferGetPage(buffer);
|
2015-09-09 17:43:37 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If page LSN differs it means that the page was modified since the last
|
|
|
|
* read. killedItems could be not valid so LP_DEAD hints applying is not
|
|
|
|
* safe.
|
|
|
|
*/
|
2018-01-09 19:54:39 +01:00
|
|
|
if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
|
2015-09-09 17:43:37 +02:00
|
|
|
{
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
so->numKilled = 0; /* reset counter */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
Assert(GistPageIsLeaf(page));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark all killedItems as dead. We need no additional recheck, because,
|
2019-08-05 05:14:58 +02:00
|
|
|
* if page was modified, curPageLSN must have changed.
|
2015-09-09 17:43:37 +02:00
|
|
|
*/
|
|
|
|
for (i = 0; i < so->numKilled; i++)
|
|
|
|
{
|
|
|
|
offnum = so->killedItems[i];
|
|
|
|
iid = PageGetItemId(page, offnum);
|
|
|
|
ItemIdMarkDead(iid);
|
|
|
|
killedsomething = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (killedsomething)
|
|
|
|
{
|
|
|
|
GistMarkPageHasGarbage(page);
|
|
|
|
MarkBufferDirtyHint(buffer, true);
|
|
|
|
}
|
|
|
|
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Always reset the scan state, so we don't look for same items on other
|
|
|
|
* pages.
|
|
|
|
*/
|
|
|
|
so->numKilled = 0;
|
|
|
|
}
|
2005-10-06 04:29:23 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/*
|
|
|
|
* gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
|
|
|
|
*
|
|
|
|
* The index tuple might represent either a heap tuple or a lower index page,
|
|
|
|
* depending on whether the containing page is a leaf page or not.
|
|
|
|
*
|
2015-05-15 13:26:51 +02:00
|
|
|
* On success return for a heap tuple, *recheck_p is set to indicate whether
|
Fix datatype confusion with the new lossy GiST distance functions.
We can only support a lossy distance function when the distance function's
datatype is comparable with the original ordering operator's datatype.
The distance function always returns a float8, so we are limited to float8,
and float4 (by a hard-coded cast of the float8 to float4).
In light of this limitation, it seems like a good idea to have a separate
'recheck' flag for the ORDER BY expressions, so that if you have a non-lossy
distance function, it still works with lossy quals. There are cases like
that with the build-in or contrib opclasses, but it's plausible.
There was a hidden assumption that the ORDER BY values returned by GiST
match the original ordering operator's return type, but there are plenty
of examples where that's not true, e.g. in btree_gist and pg_trgm. As long
as the distance function is not lossy, we can tolerate that and just not
return the distance to the executor (or rather, always return NULL). The
executor doesn't need the distances if there are no lossy results.
There was another little bug: the recheck variable was not initialized
before calling the distance function. That revealed the bigger issue,
as the executor tried to reorder tuples that didn't need reordering, and
that failed because of the datatype mismatch.
2015-05-15 16:59:46 +02:00
|
|
|
* the quals need to be rechecked. We recheck if any of the consistent()
|
2015-05-15 13:26:51 +02:00
|
|
|
* functions request it. recheck is not interesting when examining a non-leaf
|
|
|
|
* entry, since we must visit the lower index page if there's any doubt.
|
Fix datatype confusion with the new lossy GiST distance functions.
We can only support a lossy distance function when the distance function's
datatype is comparable with the original ordering operator's datatype.
The distance function always returns a float8, so we are limited to float8,
and float4 (by a hard-coded cast of the float8 to float4).
In light of this limitation, it seems like a good idea to have a separate
'recheck' flag for the ORDER BY expressions, so that if you have a non-lossy
distance function, it still works with lossy quals. There are cases like
that with the build-in or contrib opclasses, but it's plausible.
There was a hidden assumption that the ORDER BY values returned by GiST
match the original ordering operator's return type, but there are plenty
of examples where that's not true, e.g. in btree_gist and pg_trgm. As long
as the distance function is not lossy, we can tolerate that and just not
return the distance to the executor (or rather, always return NULL). The
executor doesn't need the distances if there are no lossy results.
There was another little bug: the recheck variable was not initialized
before calling the distance function. That revealed the bigger issue,
as the executor tried to reorder tuples that didn't need reordering, and
that failed because of the datatype mismatch.
2015-05-15 16:59:46 +02:00
|
|
|
* Similarly, *recheck_distances_p is set to indicate whether the distances
|
|
|
|
* need to be rechecked, and it is also ignored for non-leaf entries.
|
2010-12-04 02:52:18 +01:00
|
|
|
*
|
2019-09-19 20:30:19 +02:00
|
|
|
* If we are doing an ordered scan, so->distances[] is filled with distance
|
|
|
|
* data from the distance() functions before returning success.
|
2010-12-04 02:52:18 +01:00
|
|
|
*
|
|
|
|
* We must decompress the key in the IndexTuple before passing it to the
|
|
|
|
* sk_funcs (which actually are the opclass Consistent or Distance methods).
|
|
|
|
*
|
|
|
|
* Note that this function is always invoked in a short-lived memory context,
|
|
|
|
* so we don't need to worry about cleaning up allocated memory, either here
|
|
|
|
* or in the implementation of any Consistent or Distance methods.
|
|
|
|
*/
|
|
|
|
static bool
|
|
|
|
gistindex_keytest(IndexScanDesc scan,
|
|
|
|
IndexTuple tuple,
|
|
|
|
Page page,
|
|
|
|
OffsetNumber offset,
|
Fix datatype confusion with the new lossy GiST distance functions.
We can only support a lossy distance function when the distance function's
datatype is comparable with the original ordering operator's datatype.
The distance function always returns a float8, so we are limited to float8,
and float4 (by a hard-coded cast of the float8 to float4).
In light of this limitation, it seems like a good idea to have a separate
'recheck' flag for the ORDER BY expressions, so that if you have a non-lossy
distance function, it still works with lossy quals. There are cases like
that with the build-in or contrib opclasses, but it's plausible.
There was a hidden assumption that the ORDER BY values returned by GiST
match the original ordering operator's return type, but there are plenty
of examples where that's not true, e.g. in btree_gist and pg_trgm. As long
as the distance function is not lossy, we can tolerate that and just not
return the distance to the executor (or rather, always return NULL). The
executor doesn't need the distances if there are no lossy results.
There was another little bug: the recheck variable was not initialized
before calling the distance function. That revealed the bigger issue,
as the executor tried to reorder tuples that didn't need reordering, and
that failed because of the datatype mismatch.
2015-05-15 16:59:46 +02:00
|
|
|
bool *recheck_p,
|
|
|
|
bool *recheck_distances_p)
|
2005-09-22 22:44:36 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
|
|
|
|
GISTSTATE *giststate = so->giststate;
|
|
|
|
ScanKey key = scan->keyData;
|
|
|
|
int keySize = scan->numberOfKeys;
|
2019-09-19 20:30:19 +02:00
|
|
|
IndexOrderByDistance *distance_p;
|
2010-12-04 02:52:18 +01:00
|
|
|
Relation r = scan->indexRelation;
|
2005-09-22 22:44:36 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
*recheck_p = false;
|
Fix datatype confusion with the new lossy GiST distance functions.
We can only support a lossy distance function when the distance function's
datatype is comparable with the original ordering operator's datatype.
The distance function always returns a float8, so we are limited to float8,
and float4 (by a hard-coded cast of the float8 to float4).
In light of this limitation, it seems like a good idea to have a separate
'recheck' flag for the ORDER BY expressions, so that if you have a non-lossy
distance function, it still works with lossy quals. There are cases like
that with the build-in or contrib opclasses, but it's plausible.
There was a hidden assumption that the ORDER BY values returned by GiST
match the original ordering operator's return type, but there are plenty
of examples where that's not true, e.g. in btree_gist and pg_trgm. As long
as the distance function is not lossy, we can tolerate that and just not
return the distance to the executor (or rather, always return NULL). The
executor doesn't need the distances if there are no lossy results.
There was another little bug: the recheck variable was not initialized
before calling the distance function. That revealed the bigger issue,
as the executor tried to reorder tuples that didn't need reordering, and
that failed because of the datatype mismatch.
2015-05-15 16:59:46 +02:00
|
|
|
*recheck_distances_p = false;
|
2005-06-27 14:45:23 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/*
|
|
|
|
* If it's a leftover invalid tuple from pre-9.1, treat it as a match with
|
|
|
|
* minimum possible distances. This means we'll always follow it to the
|
|
|
|
* referenced page.
|
|
|
|
*/
|
|
|
|
if (GistTupleIsInvalid(tuple))
|
2008-08-23 12:37:24 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
int i;
|
|
|
|
|
2010-12-04 19:47:08 +01:00
|
|
|
if (GistPageIsLeaf(page)) /* shouldn't happen */
|
2011-05-19 00:14:45 +02:00
|
|
|
elog(ERROR, "invalid GiST tuple found on leaf page");
|
2010-12-04 02:52:18 +01:00
|
|
|
for (i = 0; i < scan->numberOfOrderBys; i++)
|
2019-09-08 20:13:40 +02:00
|
|
|
{
|
2019-09-19 20:30:19 +02:00
|
|
|
so->distances[i].value = -get_float8_infinity();
|
|
|
|
so->distances[i].isnull = false;
|
2019-09-08 20:13:40 +02:00
|
|
|
}
|
2010-12-04 02:52:18 +01:00
|
|
|
return true;
|
2008-08-23 12:37:24 +02:00
|
|
|
}
|
2010-12-04 02:52:18 +01:00
|
|
|
|
|
|
|
/* Check whether it matches according to the Consistent functions */
|
|
|
|
while (keySize > 0)
|
2008-08-23 12:37:24 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
Datum datum;
|
|
|
|
bool isNull;
|
2005-06-27 14:45:23 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
datum = index_getattr(tuple,
|
|
|
|
key->sk_attno,
|
2019-03-10 09:36:47 +01:00
|
|
|
giststate->leafTupdesc,
|
2010-12-04 02:52:18 +01:00
|
|
|
&isNull);
|
2005-06-27 14:45:23 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
if (key->sk_flags & SK_ISNULL)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* On non-leaf page we can't conclude that child hasn't NULL
|
|
|
|
* values because of assumption in GiST: union (VAL, NULL) is VAL.
|
|
|
|
* But if on non-leaf page key IS NULL, then all children are
|
|
|
|
* NULL.
|
|
|
|
*/
|
|
|
|
if (key->sk_flags & SK_SEARCHNULL)
|
2005-09-22 22:44:36 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
if (GistPageIsLeaf(page) && !isNull)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Assert(key->sk_flags & SK_SEARCHNOTNULL);
|
|
|
|
if (isNull)
|
|
|
|
return false;
|
2005-09-22 22:44:36 +02:00
|
|
|
}
|
2005-06-27 14:45:23 +02:00
|
|
|
}
|
2010-12-04 02:52:18 +01:00
|
|
|
else if (isNull)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Datum test;
|
|
|
|
bool recheck;
|
|
|
|
GISTENTRY de;
|
2008-08-23 12:37:24 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
gistdentryinit(giststate, key->sk_attno - 1, &de,
|
|
|
|
datum, r, page, offset,
|
2017-08-16 06:22:32 +02:00
|
|
|
false, isNull);
|
1996-08-26 22:02:12 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/*
|
|
|
|
* Call the Consistent function to evaluate the test. The
|
|
|
|
* arguments are the index datum (as a GISTENTRY*), the comparison
|
|
|
|
* datum, the comparison operator's strategy number and subtype
|
|
|
|
* from pg_amop, and the recheck flag.
|
|
|
|
*
|
|
|
|
* (Presently there's no need to pass the subtype since it'll
|
|
|
|
* always be zero, but might as well pass it for possible future
|
|
|
|
* use.)
|
|
|
|
*
|
|
|
|
* We initialize the recheck flag to true (the safest assumption)
|
|
|
|
* in case the Consistent function forgets to set it.
|
|
|
|
*/
|
|
|
|
recheck = true;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2011-04-13 01:19:24 +02:00
|
|
|
test = FunctionCall5Coll(&key->sk_func,
|
|
|
|
key->sk_collation,
|
|
|
|
PointerGetDatum(&de),
|
|
|
|
key->sk_argument,
|
Fix assorted inconsistencies in GiST opclass support function declarations.
The conventions specified by the GiST SGML documentation were widely
ignored. For example, the strategy-number argument for "consistent" and
"distance" functions is specified to be a smallint, but most of the
built-in support functions declared it as an integer, and for that matter
the core code passed it using Int32GetDatum not Int16GetDatum. None of
that makes any real difference at runtime, but it's quite confusing for
newcomers to the code, and it makes it very hard to write an amvalidate()
function that checks support function signatures. So let's try to instill
some consistency here.
Another similar issue is that the "query" argument is not of a single
well-defined type, but could have different types depending on the strategy
(corresponding to search operators with different righthand-side argument
types). Some of the functions threw up their hands and declared the query
argument as being of "internal" type, which surely isn't right ("any" would
have been more appropriate); but the majority position seemed to be to
declare it as being of the indexed data type, corresponding to a search
operator with both input types the same. So I've specified a convention
that that's what to do always.
Also, the result of the "union" support function actually must be of the
index's storage type, but the documentation suggested declaring it to
return "internal", and some of the functions followed that. Standardize
on telling the truth, instead.
Similarly, standardize on declaring the "same" function's inputs as
being of the storage type, not "internal".
Also, somebody had forgotten to add the "recheck" argument to both
the documentation of the "distance" support function and all of their
SQL declarations, even though the C code was happily using that argument.
Clean that up too.
Fix up some other omissions in the docs too, such as documenting that
union's second input argument is vestigial.
So far as the errors in core function declarations go, we can just fix
pg_proc.h and bump catversion. Adjusting the erroneous declarations in
contrib modules is more debatable: in principle any change in those
scripts should involve an extension version bump, which is a pain.
However, since these changes are purely cosmetic and make no functional
difference, I think we can get away without doing that.
2016-01-19 18:04:32 +01:00
|
|
|
Int16GetDatum(key->sk_strategy),
|
2011-04-13 01:19:24 +02:00
|
|
|
ObjectIdGetDatum(key->sk_subtype),
|
|
|
|
PointerGetDatum(&recheck));
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
if (!DatumGetBool(test))
|
|
|
|
return false;
|
|
|
|
*recheck_p |= recheck;
|
|
|
|
}
|
2008-10-20 18:35:14 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
key++;
|
|
|
|
keySize--;
|
|
|
|
}
|
2005-05-17 02:59:30 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/* OK, it passes --- now let's compute the distances */
|
|
|
|
key = scan->orderByData;
|
2019-09-19 20:30:19 +02:00
|
|
|
distance_p = so->distances;
|
2010-12-04 02:52:18 +01:00
|
|
|
keySize = scan->numberOfOrderBys;
|
|
|
|
while (keySize > 0)
|
|
|
|
{
|
|
|
|
Datum datum;
|
|
|
|
bool isNull;
|
2005-05-17 02:59:30 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
datum = index_getattr(tuple,
|
|
|
|
key->sk_attno,
|
2019-03-10 09:36:47 +01:00
|
|
|
giststate->leafTupdesc,
|
2010-12-04 02:52:18 +01:00
|
|
|
&isNull);
|
1996-08-26 22:02:12 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
if ((key->sk_flags & SK_ISNULL) || isNull)
|
|
|
|
{
|
2019-09-08 20:13:40 +02:00
|
|
|
/* Assume distance computes as null */
|
2019-09-19 20:30:19 +02:00
|
|
|
distance_p->value = 0.0;
|
|
|
|
distance_p->isnull = true;
|
2010-12-04 02:52:18 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Datum dist;
|
2015-05-15 13:26:51 +02:00
|
|
|
bool recheck;
|
2010-12-04 02:52:18 +01:00
|
|
|
GISTENTRY de;
|
2005-03-28 01:53:05 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
gistdentryinit(giststate, key->sk_attno - 1, &de,
|
|
|
|
datum, r, page, offset,
|
2017-08-16 06:22:32 +02:00
|
|
|
false, isNull);
|
2005-09-22 22:44:36 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/*
|
|
|
|
* Call the Distance function to evaluate the distance. The
|
|
|
|
* arguments are the index datum (as a GISTENTRY*), the comparison
|
2015-05-23 21:22:25 +02:00
|
|
|
* datum, the ordering operator's strategy number and subtype from
|
|
|
|
* pg_amop, and the recheck flag.
|
2010-12-04 02:52:18 +01:00
|
|
|
*
|
|
|
|
* (Presently there's no need to pass the subtype since it'll
|
|
|
|
* always be zero, but might as well pass it for possible future
|
|
|
|
* use.)
|
|
|
|
*
|
2015-05-23 21:22:25 +02:00
|
|
|
* If the function sets the recheck flag, the returned distance is
|
|
|
|
* a lower bound on the true distance and needs to be rechecked.
|
|
|
|
* We initialize the flag to 'false'. This flag was added in
|
|
|
|
* version 9.5; distance functions written before that won't know
|
|
|
|
* about the flag, but are expected to never be lossy.
|
2010-12-04 02:52:18 +01:00
|
|
|
*/
|
Fix datatype confusion with the new lossy GiST distance functions.
We can only support a lossy distance function when the distance function's
datatype is comparable with the original ordering operator's datatype.
The distance function always returns a float8, so we are limited to float8,
and float4 (by a hard-coded cast of the float8 to float4).
In light of this limitation, it seems like a good idea to have a separate
'recheck' flag for the ORDER BY expressions, so that if you have a non-lossy
distance function, it still works with lossy quals. There are cases like
that with the build-in or contrib opclasses, but it's plausible.
There was a hidden assumption that the ORDER BY values returned by GiST
match the original ordering operator's return type, but there are plenty
of examples where that's not true, e.g. in btree_gist and pg_trgm. As long
as the distance function is not lossy, we can tolerate that and just not
return the distance to the executor (or rather, always return NULL). The
executor doesn't need the distances if there are no lossy results.
There was another little bug: the recheck variable was not initialized
before calling the distance function. That revealed the bigger issue,
as the executor tried to reorder tuples that didn't need reordering, and
that failed because of the datatype mismatch.
2015-05-15 16:59:46 +02:00
|
|
|
recheck = false;
|
2015-05-15 13:26:51 +02:00
|
|
|
dist = FunctionCall5Coll(&key->sk_func,
|
2011-04-13 01:19:24 +02:00
|
|
|
key->sk_collation,
|
|
|
|
PointerGetDatum(&de),
|
|
|
|
key->sk_argument,
|
Fix assorted inconsistencies in GiST opclass support function declarations.
The conventions specified by the GiST SGML documentation were widely
ignored. For example, the strategy-number argument for "consistent" and
"distance" functions is specified to be a smallint, but most of the
built-in support functions declared it as an integer, and for that matter
the core code passed it using Int32GetDatum not Int16GetDatum. None of
that makes any real difference at runtime, but it's quite confusing for
newcomers to the code, and it makes it very hard to write an amvalidate()
function that checks support function signatures. So let's try to instill
some consistency here.
Another similar issue is that the "query" argument is not of a single
well-defined type, but could have different types depending on the strategy
(corresponding to search operators with different righthand-side argument
types). Some of the functions threw up their hands and declared the query
argument as being of "internal" type, which surely isn't right ("any" would
have been more appropriate); but the majority position seemed to be to
declare it as being of the indexed data type, corresponding to a search
operator with both input types the same. So I've specified a convention
that that's what to do always.
Also, the result of the "union" support function actually must be of the
index's storage type, but the documentation suggested declaring it to
return "internal", and some of the functions followed that. Standardize
on telling the truth, instead.
Similarly, standardize on declaring the "same" function's inputs as
being of the storage type, not "internal".
Also, somebody had forgotten to add the "recheck" argument to both
the documentation of the "distance" support function and all of their
SQL declarations, even though the C code was happily using that argument.
Clean that up too.
Fix up some other omissions in the docs too, such as documenting that
union's second input argument is vestigial.
So far as the errors in core function declarations go, we can just fix
pg_proc.h and bump catversion. Adjusting the erroneous declarations in
contrib modules is more debatable: in principle any change in those
scripts should involve an extension version bump, which is a pain.
However, since these changes are purely cosmetic and make no functional
difference, I think we can get away without doing that.
2016-01-19 18:04:32 +01:00
|
|
|
Int16GetDatum(key->sk_strategy),
|
2015-05-15 13:26:51 +02:00
|
|
|
ObjectIdGetDatum(key->sk_subtype),
|
|
|
|
PointerGetDatum(&recheck));
|
Fix datatype confusion with the new lossy GiST distance functions.
We can only support a lossy distance function when the distance function's
datatype is comparable with the original ordering operator's datatype.
The distance function always returns a float8, so we are limited to float8,
and float4 (by a hard-coded cast of the float8 to float4).
In light of this limitation, it seems like a good idea to have a separate
'recheck' flag for the ORDER BY expressions, so that if you have a non-lossy
distance function, it still works with lossy quals. There are cases like
that with the build-in or contrib opclasses, but it's plausible.
There was a hidden assumption that the ORDER BY values returned by GiST
match the original ordering operator's return type, but there are plenty
of examples where that's not true, e.g. in btree_gist and pg_trgm. As long
as the distance function is not lossy, we can tolerate that and just not
return the distance to the executor (or rather, always return NULL). The
executor doesn't need the distances if there are no lossy results.
There was another little bug: the recheck variable was not initialized
before calling the distance function. That revealed the bigger issue,
as the executor tried to reorder tuples that didn't need reordering, and
that failed because of the datatype mismatch.
2015-05-15 16:59:46 +02:00
|
|
|
*recheck_distances_p |= recheck;
|
2019-09-19 20:30:19 +02:00
|
|
|
distance_p->value = DatumGetFloat8(dist);
|
|
|
|
distance_p->isnull = false;
|
2010-12-04 02:52:18 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
key++;
|
2019-09-19 20:30:19 +02:00
|
|
|
distance_p++;
|
2010-12-04 02:52:18 +01:00
|
|
|
keySize--;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2005-03-28 01:53:05 +02:00
|
|
|
}
|
|
|
|
|
2005-05-17 02:59:30 +02:00
|
|
|
/*
 * Scan all items on the GiST index page identified by *pageItem, and insert
 * them into the queue (or directly to output areas)
 *
 * scan: index scan we are executing
 * pageItem: search queue item identifying an index page to scan
 * myDistances: distances array associated with pageItem, or NULL at the root
 * tbm: if not NULL, gistgetbitmap's output bitmap
 * ntids: if not NULL, gistgetbitmap's output tuple counter
 *
 * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap
 * tuples should be reported directly into the bitmap. If they are NULL,
 * we're doing a plain or ordered indexscan. For a plain indexscan, heap
 * tuple TIDs are returned into so->pageData[]. For an ordered indexscan,
 * heap tuple TIDs are pushed into individual search queue items. In an
 * index-only scan, reconstructed index tuples are returned along with the
 * TIDs.
 *
 * If we detect that the index page has split since we saw its downlink
 * in the parent, we push its new right sibling onto the queue so the
 * sibling will be processed next.
 */
static void
gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
			 IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids)
{
	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
	GISTSTATE  *giststate = so->giststate;
	Relation	r = scan->indexRelation;
	Buffer		buffer;
	Page		page;
	GISTPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber i;
	MemoryContext oldcxt;

	/* pageItem must identify an index page, never a heap tuple */
	Assert(!GISTSearchItemIsHeap(*pageItem));

	buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
	LockBuffer(buffer, GIST_SHARE);
	PredicateLockPage(r, BufferGetBlockNumber(buffer), scan->xs_snapshot);
	gistcheckpage(scan->indexRelation, buffer);
	page = BufferGetPage(buffer);
	TestForOldSnapshot(scan->xs_snapshot, r, page);
	opaque = GistPageGetOpaque(page);

	/*
	 * Check if we need to follow the rightlink. We need to follow it if the
	 * page was concurrently split since we visited the parent (in which case
	 * parentlsn < nsn), or if the system crashed after a page split but
	 * before the downlink was inserted into the parent.
	 */
	if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
		(GistFollowRight(page) ||
		 pageItem->data.parentlsn < GistPageGetNSN(page)) &&
		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
	{
		/* There was a page split, follow right link to add pages */
		GISTSearchItem *item;

		/* This can't happen when starting at the root */
		Assert(myDistances != NULL);

		/* Queue items must be allocated in the long-lived queue context */
		oldcxt = MemoryContextSwitchTo(so->queueCxt);

		/* Create new GISTSearchItem for the right sibling index page */
		item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys));
		item->blkno = opaque->rightlink;
		item->data.parentlsn = pageItem->data.parentlsn;

		/* Insert it into the queue using same distances as for this page */
		memcpy(item->distances, myDistances,
			   sizeof(item->distances[0]) * scan->numberOfOrderBys);

		pairingheap_add(so->queue, &item->phNode);

		MemoryContextSwitchTo(oldcxt);
	}

	/*
	 * Check if the page was deleted after we saw the downlink. There's
	 * nothing of interest on a deleted page. Note that we must do this after
	 * checking the NSN for concurrent splits! It's possible that the page
	 * originally contained some tuples that are visible to us, but was split
	 * so that all the visible tuples were moved to another page, and then
	 * this page was deleted.
	 */
	if (GistPageIsDeleted(page))
	{
		UnlockReleaseBuffer(buffer);
		return;
	}

	so->nPageData = so->curPageData = 0;
	scan->xs_hitup = NULL;		/* might point into pageDataCxt */
	if (so->pageDataCxt)
		MemoryContextReset(so->pageDataCxt);

	/*
	 * We save the LSN of the page as we read it, so that we know whether it
	 * is safe to apply LP_DEAD hints to the page later. This allows us to
	 * drop the pin for MVCC scans, which allows vacuum to avoid blocking.
	 */
	so->curPageLSN = BufferGetLSNAtomic(buffer);

	/*
	 * check all tuples on page
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
	{
		ItemId		iid = PageGetItemId(page, i);
		IndexTuple	it;
		bool		match;
		bool		recheck;
		bool		recheck_distances;

		/*
		 * If the scan specifies not to return killed tuples, then we treat a
		 * killed tuple as not passing the qual.
		 */
		if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
			continue;

		it = (IndexTuple) PageGetItem(page, iid);

		/*
		 * Must call gistindex_keytest in tempCxt, and clean up any leftover
		 * junk afterward.
		 */
		oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt);

		match = gistindex_keytest(scan, it, page, i,
								  &recheck, &recheck_distances);

		MemoryContextSwitchTo(oldcxt);
		MemoryContextReset(so->giststate->tempCxt);

		/* Ignore tuple if it doesn't match */
		if (!match)
			continue;

		if (tbm && GistPageIsLeaf(page))
		{
			/*
			 * getbitmap scan, so just push heap tuple TIDs into the bitmap
			 * without worrying about ordering
			 */
			tbm_add_tuples(tbm, &it->t_tid, 1, recheck);
			(*ntids)++;
		}
		else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page))
		{
			/*
			 * Non-ordered scan, so report tuples in so->pageData[]
			 */
			so->pageData[so->nPageData].heapPtr = it->t_tid;
			so->pageData[so->nPageData].recheck = recheck;
			so->pageData[so->nPageData].offnum = i;

			/*
			 * In an index-only scan, also fetch the data from the tuple. The
			 * reconstructed tuples are stored in pageDataCxt.
			 */
			if (scan->xs_want_itup)
			{
				oldcxt = MemoryContextSwitchTo(so->pageDataCxt);
				so->pageData[so->nPageData].recontup =
					gistFetchTuple(giststate, r, it);
				MemoryContextSwitchTo(oldcxt);
			}
			so->nPageData++;
		}
		else
		{
			/*
			 * Must push item into search queue. We get here for any lower
			 * index page, and also for heap tuples if doing an ordered
			 * search.
			 */
			GISTSearchItem *item;
			int			nOrderBys = scan->numberOfOrderBys;

			oldcxt = MemoryContextSwitchTo(so->queueCxt);

			/* Create new GISTSearchItem for this item */
			item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys));

			if (GistPageIsLeaf(page))
			{
				/* Creating heap-tuple GISTSearchItem */
				item->blkno = InvalidBlockNumber;
				item->data.heap.heapPtr = it->t_tid;
				item->data.heap.recheck = recheck;
				item->data.heap.recheckDistances = recheck_distances;

				/*
				 * In an index-only scan, also fetch the data from the tuple.
				 */
				if (scan->xs_want_itup)
					item->data.heap.recontup = gistFetchTuple(giststate, r, it);
			}
			else
			{
				/* Creating index-page GISTSearchItem */
				item->blkno = ItemPointerGetBlockNumber(&it->t_tid);

				/*
				 * LSN of current page is lsn of parent page for child. We
				 * only have a shared lock, so we need to get the LSN
				 * atomically.
				 */
				item->data.parentlsn = BufferGetLSNAtomic(buffer);
			}

			/* Insert it into the queue using new distance data */
			memcpy(item->distances, so->distances,
				   sizeof(item->distances[0]) * nOrderBys);

			pairingheap_add(so->queue, &item->phNode);

			MemoryContextSwitchTo(oldcxt);
		}
	}

	UnlockReleaseBuffer(buffer);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract next item (in order) from search queue
|
|
|
|
*
|
|
|
|
* Returns a GISTSearchItem or NULL. Caller must pfree item when done with it.
|
|
|
|
*/
|
|
|
|
static GISTSearchItem *
|
|
|
|
getNextGISTSearchItem(GISTScanOpaque so)
|
|
|
|
{
|
2014-12-22 11:05:57 +01:00
|
|
|
GISTSearchItem *item;
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2014-12-22 11:05:57 +01:00
|
|
|
if (!pairingheap_is_empty(so->queue))
|
|
|
|
{
|
|
|
|
item = (GISTSearchItem *) pairingheap_remove_first(so->queue);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* Done when both heaps are empty */
|
|
|
|
item = NULL;
|
2010-12-04 02:52:18 +01:00
|
|
|
}
|
2008-08-23 12:37:24 +02:00
|
|
|
|
2014-12-22 11:05:57 +01:00
|
|
|
/* Return item; caller is responsible to pfree it */
|
|
|
|
return item;
|
2010-12-04 02:52:18 +01:00
|
|
|
}
|
2005-06-27 14:45:23 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/*
|
|
|
|
* Fetch next heap tuple in an ordered search
|
|
|
|
*/
|
|
|
|
static bool
getNextNearest(IndexScanDesc scan)
{
	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
	bool		res = false;

	if (scan->xs_hitup)
	{
		/* free previously returned tuple */
		pfree(scan->xs_hitup);
		scan->xs_hitup = NULL;
	}

	do
	{
		/* Pop the next item, in distance order, off the search queue */
		GISTSearchItem *item = getNextGISTSearchItem(so);

		if (!item)
			break;				/* queue exhausted: no more matches */

		if (GISTSearchItemIsHeap(*item))
		{
			/* found a heap item at currently minimal distance */
			scan->xs_heaptid = item->data.heap.heapPtr;
			scan->xs_recheck = item->data.heap.recheck;

			/* pass the ORDER BY distances back to the executor */
			index_store_float8_orderby_distances(scan, so->orderByTypes,
												 item->distances,
												 item->data.heap.recheckDistances);

			/* in an index-only scan, also return the reconstructed tuple. */
			if (scan->xs_want_itup)
				scan->xs_hitup = item->data.heap.recontup;
			res = true;
		}
		else
		{
			/* visit an index page, extract its items into queue */
			CHECK_FOR_INTERRUPTS();

			gistScanPage(scan, item, item->distances, NULL, NULL);
		}

		/* queue items are individually palloc'd; release this one */
		pfree(item);
	} while (!res);

	return res;
}
|
|
|
|
|
2005-05-17 02:59:30 +02:00
|
|
|
/*
|
2010-12-04 02:52:18 +01:00
|
|
|
* gistgettuple() -- Get the next tuple in the scan
|
2005-05-17 02:59:30 +02:00
|
|
|
*/
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
bool
|
|
|
|
gistgettuple(IndexScanDesc scan, ScanDirection dir)
|
1996-08-26 22:02:12 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
|
2005-05-17 02:59:30 +02:00
|
|
|
|
2010-12-04 04:43:01 +01:00
|
|
|
if (dir != ForwardScanDirection)
|
|
|
|
elog(ERROR, "GiST only supports forward scan direction");
|
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
if (!so->qual_ok)
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return false;
|
2005-05-17 02:59:30 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
if (so->firstCall)
|
|
|
|
{
|
|
|
|
/* Begin the scan by processing the root page */
|
|
|
|
GISTSearchItem fakeItem;
|
2008-04-14 19:05:34 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
pgstat_count_index_scan(scan->indexRelation);
|
2005-06-20 12:29:37 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
so->firstCall = false;
|
|
|
|
so->curPageData = so->nPageData = 0;
|
Fix pfree-of-already-freed-tuple when rescanning a GiST index-only scan.
GiST's getNextNearest() function attempts to pfree the previously-returned
tuple if any (that is, scan->xs_hitup in HEAD, or scan->xs_itup in older
branches). However, if we are rescanning a plan node after ending a
previous scan early, those tuple pointers could be pointing to garbage,
because they would be pointing into the scan's pageDataCxt or queueCxt
which has been reset. In a debug build this reliably results in a crash,
although I think it might sometimes accidentally fail to fail in
production builds.
To fix, clear the pointer field anyplace we reset a context it might
be pointing into. This may be overkill --- I think probably only the
queueCxt case is involved in this bug, so that resetting in gistrescan()
would be sufficient --- but dangling pointers are generally bad news,
so let's avoid them.
Another plausible answer might be to just not bother with the pfree in
getNextNearest(). The reconstructed tuples would go away anyway in the
context resets, and I'm far from convinced that freeing them a bit earlier
really saves anything meaningful. I'll stick with the original logic in
this patch, but if we find more problems in the same area we should
consider that approach.
Per bug #14641 from Denis Smirnov. Back-patch to 9.5 where this
logic was introduced.
Discussion: https://postgr.es/m/20170504072034.24366.57688@wrigleys.postgresql.org
2017-05-04 19:59:13 +02:00
|
|
|
scan->xs_hitup = NULL;
|
2015-03-26 18:12:00 +01:00
|
|
|
if (so->pageDataCxt)
|
|
|
|
MemoryContextReset(so->pageDataCxt);
|
2003-11-12 22:15:59 +01:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
fakeItem.blkno = GIST_ROOT_BLKNO;
|
|
|
|
memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
|
2019-09-19 20:30:19 +02:00
|
|
|
gistScanPage(scan, &fakeItem, NULL, NULL, NULL);
|
2010-12-04 02:52:18 +01:00
|
|
|
}
|
2006-05-24 13:01:39 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
if (scan->numberOfOrderBys > 0)
|
|
|
|
{
|
|
|
|
/* Must fetch tuples in strict distance order */
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return getNextNearest(scan);
|
2010-12-04 02:52:18 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* Fetch tuples index-page-at-a-time */
|
|
|
|
for (;;)
|
2006-05-24 13:01:39 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
if (so->curPageData < so->nPageData)
|
2010-01-01 22:53:49 +01:00
|
|
|
{
|
2015-09-09 17:43:37 +02:00
|
|
|
if (scan->kill_prior_tuple && so->curPageData > 0)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (so->killedItems == NULL)
|
|
|
|
{
|
|
|
|
MemoryContext oldCxt =
|
|
|
|
MemoryContextSwitchTo(so->giststate->scanCxt);
|
|
|
|
|
|
|
|
so->killedItems =
|
|
|
|
(OffsetNumber *) palloc(MaxIndexTuplesPerPage
|
|
|
|
* sizeof(OffsetNumber));
|
2015-03-26 18:12:00 +01:00
|
|
|
|
2015-09-09 17:43:37 +02:00
|
|
|
MemoryContextSwitchTo(oldCxt);
|
|
|
|
}
|
|
|
|
if (so->numKilled < MaxIndexTuplesPerPage)
|
|
|
|
so->killedItems[so->numKilled++] =
|
|
|
|
so->pageData[so->curPageData - 1].offnum;
|
|
|
|
}
|
2010-12-04 02:52:18 +01:00
|
|
|
/* continuing to return tuples from a leaf page */
|
tableam: Add and use scan APIs.
Too allow table accesses to be not directly dependent on heap, several
new abstractions are needed. Specifically:
1) Heap scans need to be generalized into table scans. Do this by
introducing TableScanDesc, which will be the "base class" for
individual AMs. This contains the AM independent fields from
HeapScanDesc.
The previous heap_{beginscan,rescan,endscan} et al. have been
replaced with a table_ version.
There's no direct replacement for heap_getnext(), as that returned
a HeapTuple, which is undesirable for a other AMs. Instead there's
table_scan_getnextslot(). But note that heap_getnext() lives on,
it's still used widely to access catalog tables.
This is achieved by new scan_begin, scan_end, scan_rescan,
scan_getnextslot callbacks.
2) The portion of parallel scans that's shared between backends need
to be able to do so without the user doing per-AM work. To achieve
that new parallelscan_{estimate, initialize, reinitialize}
callbacks are introduced, which operate on a new
ParallelTableScanDesc, which again can be subclassed by AMs.
As it is likely that several AMs are going to be block oriented,
block oriented callbacks that can be shared between such AMs are
provided and used by heap. table_block_parallelscan_{estimate,
intiialize, reinitialize} as callbacks, and
table_block_parallelscan_{nextpage, init} for use in AMs. These
operate on a ParallelBlockTableScanDesc.
3) Index scans need to be able to access tables to return a tuple, and
there needs to be state across individual accesses to the heap to
store state like buffers. That's now handled by introducing a
sort-of-scan IndexFetchTable, which again is intended to be
subclassed by individual AMs (for heap IndexFetchHeap).
The relevant callbacks for an AM are index_fetch_{end, begin,
reset} to create the necessary state, and index_fetch_tuple to
retrieve an indexed tuple. Note that index_fetch_tuple
implementations need to be smarter than just blindly fetching the
tuples for AMs that have optimizations similar to heap's HOT - the
currently alive tuple in the update chain needs to be fetched if
appropriate.
Similar to table_scan_getnextslot(), it's undesirable to continue
to return HeapTuples. Thus index_fetch_heap (might want to rename
that later) now accepts a slot as an argument. Core code doesn't
have a lot of call sites performing index scans without going
through the systable_* API (in contrast to loads of heap_getnext
calls and working directly with HeapTuples).
Index scans now store the result of a search in
IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the
target is not generally a HeapTuple anymore that seems cleaner.
To be able to sensible adapt code to use the above, two further
callbacks have been introduced:
a) slot_callbacks returns a TupleTableSlotOps* suitable for creating
slots capable of holding a tuple of the AMs
type. table_slot_callbacks() and table_slot_create() are based
upon that, but have additional logic to deal with views, foreign
tables, etc.
While this change could have been done separately, nearly all the
call sites that needed to be adapted for the rest of this commit
also would have been needed to be adapted for
table_slot_callbacks(), making separation not worthwhile.
b) tuple_satisfies_snapshot checks whether the tuple in a slot is
currently visible according to a snapshot. That's required as a few
places now don't have a buffer + HeapTuple around, but a
slot (which in heap's case internally has that information).
Additionally a few infrastructure changes were needed:
I) SysScanDesc, as used by systable_{beginscan, getnext} et al. now
internally uses a slot to keep track of tuples. While
systable_getnext() still returns HeapTuples, and will so for the
foreseeable future, the index API (see 1) above) now only deals with
slots.
The remainder, and largest part, of this commit is then adjusting all
scans in postgres to use the new APIs.
Author: Andres Freund, Haribabu Kommi, Alvaro Herrera
Discussion:
https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 20:46:41 +01:00
|
|
|
scan->xs_heaptid = so->pageData[so->curPageData].heapPtr;
|
2010-12-04 02:52:18 +01:00
|
|
|
scan->xs_recheck = so->pageData[so->curPageData].recheck;
|
2015-03-26 18:12:00 +01:00
|
|
|
|
|
|
|
/* in an index-only scan, also return the reconstructed tuple */
|
|
|
|
if (scan->xs_want_itup)
|
2017-02-27 23:20:34 +01:00
|
|
|
scan->xs_hitup = so->pageData[so->curPageData].recontup;
|
2015-03-26 18:12:00 +01:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
so->curPageData++;
|
2015-03-26 18:12:00 +01:00
|
|
|
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return true;
|
2010-01-01 22:53:49 +01:00
|
|
|
}
|
2010-12-04 02:52:18 +01:00
|
|
|
|
2015-09-09 17:43:37 +02:00
|
|
|
/*
|
2019-07-22 03:01:50 +02:00
|
|
|
* Check the last returned tuple and add it to killedItems if
|
2015-09-09 17:43:37 +02:00
|
|
|
* necessary
|
|
|
|
*/
|
|
|
|
if (scan->kill_prior_tuple
|
|
|
|
&& so->curPageData > 0
|
|
|
|
&& so->curPageData == so->nPageData)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (so->killedItems == NULL)
|
|
|
|
{
|
|
|
|
MemoryContext oldCxt =
|
|
|
|
MemoryContextSwitchTo(so->giststate->scanCxt);
|
|
|
|
|
|
|
|
so->killedItems =
|
|
|
|
(OffsetNumber *) palloc(MaxIndexTuplesPerPage
|
|
|
|
* sizeof(OffsetNumber));
|
|
|
|
|
|
|
|
MemoryContextSwitchTo(oldCxt);
|
|
|
|
}
|
|
|
|
if (so->numKilled < MaxIndexTuplesPerPage)
|
|
|
|
so->killedItems[so->numKilled++] =
|
|
|
|
so->pageData[so->curPageData - 1].offnum;
|
|
|
|
}
|
2010-12-04 02:52:18 +01:00
|
|
|
/* find and process the next index page */
|
|
|
|
do
|
2010-01-01 22:53:49 +01:00
|
|
|
{
|
2015-09-09 18:21:16 +02:00
|
|
|
GISTSearchItem *item;
|
|
|
|
|
2015-09-09 17:43:37 +02:00
|
|
|
if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0))
|
|
|
|
gistkillitems(scan);
|
|
|
|
|
2015-09-09 18:21:16 +02:00
|
|
|
item = getNextGISTSearchItem(so);
|
2001-05-15 16:14:49 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
if (!item)
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return false;
|
2008-04-14 19:05:34 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
CHECK_FOR_INTERRUPTS();
|
2007-04-07 00:33:43 +02:00
|
|
|
|
2015-09-09 17:43:37 +02:00
|
|
|
/* save current item BlockNumber for next gistkillitems() call */
|
|
|
|
so->curBlkno = item->blkno;
|
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/*
|
|
|
|
* While scanning a leaf page, ItemPointers of matching heap
|
|
|
|
* tuples are stored in so->pageData. If there are any on
|
|
|
|
* this page, we fall out of the inner "do" and loop around to
|
|
|
|
* return them.
|
|
|
|
*/
|
2019-09-19 20:30:19 +02:00
|
|
|
gistScanPage(scan, item, item->distances, NULL, NULL);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
pfree(item);
|
|
|
|
} while (so->nPageData == 0);
|
|
|
|
}
|
1996-08-26 22:02:12 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-05-17 02:59:30 +02:00
|
|
|
/*
|
2010-12-04 02:52:18 +01:00
|
|
|
* gistgetbitmap() -- Get a bitmap of all heap tuple locations
|
2005-05-17 02:59:30 +02:00
|
|
|
*/
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
int64
|
|
|
|
gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
|
1996-08-26 22:02:12 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
|
|
|
|
int64 ntids = 0;
|
|
|
|
GISTSearchItem fakeItem;
|
|
|
|
|
|
|
|
if (!so->qual_ok)
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return 0;
|
2010-12-04 02:52:18 +01:00
|
|
|
|
|
|
|
pgstat_count_index_scan(scan->indexRelation);
|
2005-05-17 02:59:30 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
/* Begin the scan by processing the root page */
|
|
|
|
so->curPageData = so->nPageData = 0;
|
Fix pfree-of-already-freed-tuple when rescanning a GiST index-only scan.
GiST's getNextNearest() function attempts to pfree the previously-returned
tuple if any (that is, scan->xs_hitup in HEAD, or scan->xs_itup in older
branches). However, if we are rescanning a plan node after ending a
previous scan early, those tuple pointers could be pointing to garbage,
because they would be pointing into the scan's pageDataCxt or queueCxt
which has been reset. In a debug build this reliably results in a crash,
although I think it might sometimes accidentally fail to fail in
production builds.
To fix, clear the pointer field anyplace we reset a context it might
be pointing into. This may be overkill --- I think probably only the
queueCxt case is involved in this bug, so that resetting in gistrescan()
would be sufficient --- but dangling pointers are generally bad news,
so let's avoid them.
Another plausible answer might be to just not bother with the pfree in
getNextNearest(). The reconstructed tuples would go away anyway in the
context resets, and I'm far from convinced that freeing them a bit earlier
really saves anything meaningful. I'll stick with the original logic in
this patch, but if we find more problems in the same area we should
consider that approach.
Per bug #14641 from Denis Smirnov. Back-patch to 9.5 where this
logic was introduced.
Discussion: https://postgr.es/m/20170504072034.24366.57688@wrigleys.postgresql.org
2017-05-04 19:59:13 +02:00
|
|
|
scan->xs_hitup = NULL;
|
2015-03-26 18:12:00 +01:00
|
|
|
if (so->pageDataCxt)
|
|
|
|
MemoryContextReset(so->pageDataCxt);
|
2010-12-04 02:52:18 +01:00
|
|
|
|
|
|
|
fakeItem.blkno = GIST_ROOT_BLKNO;
|
|
|
|
memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN));
|
2019-09-19 20:30:19 +02:00
|
|
|
gistScanPage(scan, &fakeItem, NULL, tbm, &ntids);
|
2005-05-17 02:59:30 +02:00
|
|
|
|
|
|
|
/*
|
2010-12-04 02:52:18 +01:00
|
|
|
* While scanning a leaf page, ItemPointers of matching heap tuples will
|
|
|
|
* be stored directly into tbm, so we don't need to deal with them here.
|
2005-05-17 02:59:30 +02:00
|
|
|
*/
|
2010-12-04 02:52:18 +01:00
|
|
|
for (;;)
|
1997-09-07 07:04:48 +02:00
|
|
|
{
|
2010-12-04 02:52:18 +01:00
|
|
|
GISTSearchItem *item = getNextGISTSearchItem(so);
|
|
|
|
|
|
|
|
if (!item)
|
1997-09-07 07:04:48 +02:00
|
|
|
break;
|
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
CHECK_FOR_INTERRUPTS();
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2019-09-19 20:30:19 +02:00
|
|
|
gistScanPage(scan, item, item->distances, tbm, &ntids);
|
1997-09-07 07:04:48 +02:00
|
|
|
|
2010-12-04 02:52:18 +01:00
|
|
|
pfree(item);
|
|
|
|
}
|
|
|
|
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return ntids;
|
1996-08-26 22:02:12 +02:00
|
|
|
}
|
2015-03-26 18:12:00 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Can we do index-only scans on the given index column?
|
|
|
|
*
|
|
|
|
* Opclasses that implement a fetch function support index-only scans.
|
2017-09-20 05:32:27 +02:00
|
|
|
* Opclasses without compression functions also support index-only scans.
|
2019-03-10 09:36:47 +01:00
|
|
|
* Included attributes always can be fetched for index-only scans.
|
2015-03-26 18:12:00 +01:00
|
|
|
*/
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
bool
|
|
|
|
gistcanreturn(Relation index, int attno)
|
2015-03-26 18:12:00 +01:00
|
|
|
{
|
2019-03-10 09:36:47 +01:00
|
|
|
if (attno > IndexRelationGetNumberOfKeyAttributes(index) ||
|
|
|
|
OidIsValid(index_getprocid(index, attno, GIST_FETCH_PROC)) ||
|
2017-09-20 05:32:27 +02:00
|
|
|
!OidIsValid(index_getprocid(index, attno, GIST_COMPRESS_PROC)))
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return true;
|
2015-03-26 18:12:00 +01:00
|
|
|
else
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
2016-01-18 01:36:59 +01:00
|
|
|
return false;
|
2015-03-26 18:12:00 +01:00
|
|
|
}
|