/*-------------------------------------------------------------------------
 *
 * indexam.c
 *	  general index access method routines
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/index/indexam.c
 *
 * INTERFACE ROUTINES
 *		index_open		- open an index relation by relation OID
 *		index_close		- close an index relation
 *		index_beginscan - start a scan of an index with amgettuple
 *		index_beginscan_bitmap - start a scan of an index with amgetbitmap
 *		index_rescan	- restart a scan of an index
 *		index_endscan	- end a scan
 *		index_insert	- insert an index tuple into a relation
 *		index_markpos	- mark a scan position
 *		index_restrpos	- restore a scan position
 *		index_getnext_tid	- get the next TID from a scan
 *		index_fetch_heap	- get the scan's next heap tuple
 *		index_getnext	- get the next heap tuple from a scan
 *		index_getbitmap - get all tuples from a scan
 *		index_bulk_delete	- bulk deletion of index tuples
 *		index_vacuum_cleanup	- post-deletion cleanup of an index
 *		index_getprocid - get a support procedure OID
 *		index_getprocinfo - get a support procedure's lookup info
 *
 * NOTES
 *		This file contains the index_ routines which used
 *		to be a scattered collection of stuff in access/genam.
 *
 *
 * old comments
 *		Scans are implemented as follows:
 *
 *		`0' represents an invalid item pointer.
 *		`-' represents an unknown item pointer.
 *		`X' represents a known item pointer.
 *		`+' represents known or invalid item pointers.
 *		`*' represents any item pointers.
 *
 *		State is represented by a triple of these symbols in the order of
 *		previous, current, next.  Note that the case of reverse scans works
 *		identically.
 *
 *				State	Result
 *		(1)		+ + -	+ 0 0		(if the next item pointer is invalid)
 *		(2)				+ X -		(otherwise)
 *		(3)		* 0 0	* 0 0		(no change)
 *		(4)		+ X 0	X 0 0		(shift)
 *		(5)		* + X	+ X -		(shift, add unknown)
 *
 *		All other states cannot occur.
 *
 *		Note: It would be possible to cache the status of the previous and
 *		next item pointer using the flags.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/relscan.h"
#include "access/transam.h"
#include "catalog/index.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/snapmgr.h"
#include "utils/tqual.h"


/* ----------------------------------------------------------------
 *					macros used in index_ routines
 *
 * Note: the ReindexIsProcessingIndex() check in RELATION_CHECKS is there
 * to check that we don't try to scan or do retail insertions into an index
 * that is currently being rebuilt or pending rebuild.  This helps to catch
 * things that don't work when reindexing system catalogs.  The assertion
 * doesn't prevent the actual rebuild because we don't use RELATION_CHECKS
 * when calling the index AM's ambuild routine, and there is no reason for
 * ambuild to call its subsidiary routines through this file.
 * ----------------------------------------------------------------
 */
#define RELATION_CHECKS \
( \
	AssertMacro(RelationIsValid(indexRelation)), \
	AssertMacro(PointerIsValid(indexRelation->rd_am)), \
	AssertMacro(!ReindexIsProcessingIndex(RelationGetRelid(indexRelation))) \
)

#define SCAN_CHECKS \
( \
	AssertMacro(IndexScanIsValid(scan)), \
	AssertMacro(RelationIsValid(scan->indexRelation)), \
	AssertMacro(PointerIsValid(scan->indexRelation->rd_am)) \
)

#define GET_REL_PROCEDURE(pname) \
do { \
	procedure = &indexRelation->rd_aminfo->pname; \
	if (!OidIsValid(procedure->fn_oid)) \
	{ \
		RegProcedure	procOid = indexRelation->rd_am->pname; \
		if (!RegProcedureIsValid(procOid)) \
			elog(ERROR, "invalid %s regproc", CppAsString(pname)); \
		fmgr_info_cxt(procOid, procedure, indexRelation->rd_indexcxt); \
	} \
} while(0)

#define GET_SCAN_PROCEDURE(pname) \
do { \
	procedure = &scan->indexRelation->rd_aminfo->pname; \
	if (!OidIsValid(procedure->fn_oid)) \
	{ \
		RegProcedure	procOid = scan->indexRelation->rd_am->pname; \
		if (!RegProcedureIsValid(procOid)) \
			elog(ERROR, "invalid %s regproc", CppAsString(pname)); \
		fmgr_info_cxt(procOid, procedure, scan->indexRelation->rd_indexcxt); \
	} \
} while(0)

static IndexScanDesc index_beginscan_internal(Relation indexRelation,
						 int nkeys, int norderbys, Snapshot snapshot);


/* ----------------------------------------------------------------
 *				   index_ interface functions
 * ----------------------------------------------------------------
 */

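/*
 * Illustrative sketch (not part of the original file): a typical caller
 * drives an amgettuple-style scan through the routines below roughly as
 * follows.  This is a hedged, minimal outline; the table and index OIDs,
 * the attribute number, and the comparison datum are hypothetical, and
 * real callers (e.g. the executor) add error handling and resource
 * bookkeeping on top of this.
 *
 *		Relation	heapRel = heap_open(table_oid, AccessShareLock);
 *		Relation	indexRel = index_open(index_oid, AccessShareLock);
 *		ScanKeyData	skey;
 *		IndexScanDesc scan;
 *		HeapTuple	tup;
 *
 *		ScanKeyInit(&skey, 1, BTEqualStrategyNumber, F_INT4EQ,
 *					Int32GetDatum(42));
 *		scan = index_beginscan(heapRel, indexRel,
 *							   GetTransactionSnapshot(), 1, 0);
 *		index_rescan(scan, &skey, 1, NULL, 0);
 *		while ((tup = index_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			... process tup; recheck quals if scan->xs_recheck is set ...
 *		}
 *		index_endscan(scan);
 *		index_close(indexRel, AccessShareLock);
 *		heap_close(heapRel, AccessShareLock);
 */
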
/* ----------------
 *		index_open - open an index relation by relation OID
 *
 *		If lockmode is not "NoLock", the specified kind of lock is
 *		obtained on the index.  (Generally, NoLock should only be
 *		used if the caller knows it has some appropriate lock on the
 *		index already.)
 *
 *		An error is raised if the index does not exist.
 *
 *		This is a convenience routine adapted for indexscan use.
 *		Some callers may prefer to use relation_open directly.
 * ----------------
 */
Relation
index_open(Oid relationId, LOCKMODE lockmode)
{
	Relation	r;

	r = relation_open(relationId, lockmode);

	if (r->rd_rel->relkind != RELKIND_INDEX)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not an index",
						RelationGetRelationName(r))));

	return r;
}

/* ----------------
 *		index_close - close an index relation
 *
 *		If lockmode is not "NoLock", we then release the specified lock.
 *
 *		Note that it is often sensible to hold a lock beyond index_close;
 *		in that case, the lock is released automatically at xact end.
 * ----------------
 */
void
index_close(Relation relation, LOCKMODE lockmode)
{
	LockRelId	relid = relation->rd_lockInfo.lockRelId;

	Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

	/* The relcache does the real work... */
	RelationClose(relation);

	if (lockmode != NoLock)
		UnlockRelationId(&relid, lockmode);
}

/* ----------------
 *		index_insert - insert an index tuple into a relation
 * ----------------
 */
bool
index_insert(Relation indexRelation,
			 Datum *values,
			 bool *isnull,
			 ItemPointer heap_t_ctid,
			 Relation heapRelation,
			 IndexUniqueCheck checkUnique)
{
	FmgrInfo   *procedure;

	RELATION_CHECKS;
	GET_REL_PROCEDURE(aminsert);

	if (!(indexRelation->rd_am->ampredlocks))
		CheckForSerializableConflictIn(indexRelation,
									   (HeapTuple) NULL,
									   InvalidBuffer);

	/*
	 * have the am's insert proc do all the work.
	 */
	return DatumGetBool(FunctionCall6(procedure,
									  PointerGetDatum(indexRelation),
									  PointerGetDatum(values),
									  PointerGetDatum(isnull),
									  PointerGetDatum(heap_t_ctid),
									  PointerGetDatum(heapRelation),
									  Int32GetDatum((int32) checkUnique)));
}

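/*
 * Illustrative sketch (not part of the original file): a caller that has
 * just inserted heap tuple "tup" into "heapRel" and wants to maintain one
 * of its indexes would, roughly, form the key datums and call index_insert
 * as below.  Names such as indexInfo, slot, and estate are assumed to have
 * been set up the way the executor does it; this is a hedged outline, not
 * the executor's actual code path.
 *
 *		Datum		values[INDEX_MAX_KEYS];
 *		bool		isnull[INDEX_MAX_KEYS];
 *
 *		FormIndexDatum(indexInfo, slot, estate, values, isnull);
 *		index_insert(indexRel, values, isnull,
 *					 &(tup->t_self),	... heap TID the entry points at ...
 *					 heapRel,
 *					 indexRel->rd_index->indisunique ?
 *					 UNIQUE_CHECK_YES : UNIQUE_CHECK_NO);
 */
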
/*
 * index_beginscan - start a scan of an index with amgettuple
 *
 * Caller must be holding suitable locks on the heap and the index.
 */
IndexScanDesc
index_beginscan(Relation heapRelation,
				Relation indexRelation,
				Snapshot snapshot,
				int nkeys, int norderbys)
{
	IndexScanDesc scan;

	scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot);

	/*
	 * Save additional parameters into the scandesc.  Everything else was set
	 * up by RelationGetIndexScan.
	 */
	scan->heapRelation = heapRelation;
	scan->xs_snapshot = snapshot;

	return scan;
}

/*
 * index_beginscan_bitmap - start a scan of an index with amgetbitmap
 *
 * As above, caller had better be holding some lock on the parent heap
 * relation, even though it's not explicitly mentioned here.
 */
IndexScanDesc
index_beginscan_bitmap(Relation indexRelation,
					   Snapshot snapshot,
					   int nkeys)
{
	IndexScanDesc scan;

	scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot);

	/*
	 * Save additional parameters into the scandesc.  Everything else was set
	 * up by RelationGetIndexScan.
	 */
	scan->xs_snapshot = snapshot;

	return scan;
}

/*
 * index_beginscan_internal --- common code for index_beginscan variants
 */
static IndexScanDesc
index_beginscan_internal(Relation indexRelation,
						 int nkeys, int norderbys, Snapshot snapshot)
{
	IndexScanDesc scan;
	FmgrInfo   *procedure;

	RELATION_CHECKS;
	GET_REL_PROCEDURE(ambeginscan);

	if (!(indexRelation->rd_am->ampredlocks))
		PredicateLockRelation(indexRelation, snapshot);

	/*
	 * We hold a reference count to the relcache entry throughout the scan.
	 */
	RelationIncrementReferenceCount(indexRelation);

	/*
	 * Tell the AM to open a scan.
	 */
	scan = (IndexScanDesc)
		DatumGetPointer(FunctionCall3(procedure,
									  PointerGetDatum(indexRelation),
									  Int32GetDatum(nkeys),
									  Int32GetDatum(norderbys)));

	return scan;
}

/* ----------------
 *		index_rescan  - (re)start a scan of an index
 *
 * During a restart, the caller may specify a new set of scankeys and/or
 * orderbykeys; but the number of keys cannot differ from what index_beginscan
 * was told.  (Later we might relax that to "must not exceed", but currently
 * the index AMs tend to assume that scan->numberOfKeys is what to believe.)
 * To restart the scan without changing keys, pass NULL for the key arrays.
 * (Of course, keys *must* be passed on the first call, unless
 * scan->numberOfKeys is zero.)
 * ----------------
 */
void
index_rescan(IndexScanDesc scan,
			 ScanKey keys, int nkeys,
			 ScanKey orderbys, int norderbys)
{
	FmgrInfo   *procedure;

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amrescan);

	Assert(nkeys == scan->numberOfKeys);
	Assert(norderbys == scan->numberOfOrderBys);

	/* Release any held pin on a heap page */
	if (BufferIsValid(scan->xs_cbuf))
	{
		ReleaseBuffer(scan->xs_cbuf);
		scan->xs_cbuf = InvalidBuffer;
	}

	scan->xs_continue_hot = false;

	scan->kill_prior_tuple = false;		/* for safety */

	FunctionCall5(procedure,
				  PointerGetDatum(scan),
				  PointerGetDatum(keys),
				  Int32GetDatum(nkeys),
				  PointerGetDatum(orderbys),
				  Int32GetDatum(norderbys));
}

/* ----------------
 *		index_endscan - end a scan
 * ----------------
 */
void
index_endscan(IndexScanDesc scan)
{
	FmgrInfo   *procedure;

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amendscan);

	/* Release any held pin on a heap page */
	if (BufferIsValid(scan->xs_cbuf))
	{
		ReleaseBuffer(scan->xs_cbuf);
		scan->xs_cbuf = InvalidBuffer;
	}

	/* End the AM's scan */
	FunctionCall1(procedure, PointerGetDatum(scan));

	/* Release index refcount acquired by index_beginscan */
	RelationDecrementReferenceCount(scan->indexRelation);

	/* Release the scan data structure itself */
	IndexScanEnd(scan);
}

/* ----------------
 *		index_markpos - mark a scan position
 * ----------------
 */
void
index_markpos(IndexScanDesc scan)
{
	FmgrInfo   *procedure;

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(ammarkpos);

	FunctionCall1(procedure, PointerGetDatum(scan));
}

/* ----------------
 *		index_restrpos	- restore a scan position
 *
 * NOTE: this only restores the internal scan state of the index AM.
 * The current result tuple (scan->xs_ctup) doesn't change.  See comments
 * for ExecRestrPos().
 *
 * NOTE: in the presence of HOT chains, mark/restore only works correctly
 * if the scan's snapshot is MVCC-safe; that ensures that there's at most one
 * returnable tuple in each HOT chain, and so restoring the prior state at the
 * granularity of the index AM is sufficient.  Since the only current user
 * of mark/restore functionality is nodeMergejoin.c, this effectively means
 * that merge-join plans only work for MVCC snapshots.  This could be fixed
 * if necessary, but for now it seems unimportant.
 * ----------------
 */
void
index_restrpos(IndexScanDesc scan)
{
	FmgrInfo   *procedure;

	Assert(IsMVCCSnapshot(scan->xs_snapshot));

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amrestrpos);

	scan->xs_continue_hot = false;

	scan->kill_prior_tuple = false;		/* for safety */

	FunctionCall1(procedure, PointerGetDatum(scan));
}

/* ----------------
 *		index_getnext_tid - get the next TID from a scan
 *
 * The result is the next TID satisfying the scan keys,
 * or NULL if no more matching tuples exist.
 * ----------------
 */
ItemPointer
index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
{
	FmgrInfo   *procedure;
	bool		found;

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amgettuple);

	Assert(TransactionIdIsValid(RecentGlobalXmin));

	/*
	 * The AM's amgettuple proc finds the next index entry matching the scan
	 * keys, and puts the TID into scan->xs_ctup.t_self.  It should also set
	 * scan->xs_recheck and possibly scan->xs_itup, though we pay no attention
	 * to those fields here.
	 */
	found = DatumGetBool(FunctionCall2(procedure,
									   PointerGetDatum(scan),
									   Int32GetDatum(direction)));

	/* Reset kill flag immediately for safety */
	scan->kill_prior_tuple = false;

	/* If we're out of index entries, we're done */
	if (!found)
	{
		/* ... but first, release any held pin on a heap page */
		if (BufferIsValid(scan->xs_cbuf))
		{
			ReleaseBuffer(scan->xs_cbuf);
			scan->xs_cbuf = InvalidBuffer;
		}
		return NULL;
	}

	pgstat_count_index_tuples(scan->indexRelation, 1);

	/* Return the TID of the tuple we found. */
	return &scan->xs_ctup.t_self;
}

/* ----------------
 *		index_fetch_heap - get the scan's next heap tuple
 *
 * The result is a visible heap tuple associated with the index TID most
 * recently fetched by index_getnext_tid, or NULL if no more matching tuples
 * exist.  (There can be more than one matching tuple because of HOT chains,
 * although when using an MVCC snapshot it should be impossible for more than
 * one such tuple to exist.)
 *
 * On success, the buffer containing the heap tup is pinned (the pin will be
 * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan
 * call).
 *
 * Note: caller must check scan->xs_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
 */
HeapTuple
index_fetch_heap(IndexScanDesc scan)
{
	ItemPointer tid = &scan->xs_ctup.t_self;
	bool		all_dead = false;
	bool		got_heap_tuple;

	/* We can skip the buffer-switching logic if we're in mid-HOT chain. */
	if (!scan->xs_continue_hot)
	{
		/* Switch to correct buffer if we don't have it already */
		Buffer		prev_buf = scan->xs_cbuf;

		scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
											 scan->heapRelation,
											 ItemPointerGetBlockNumber(tid));

		/*
		 * Prune page, but only if we weren't already on this page
		 */
		if (prev_buf != scan->xs_cbuf)
			heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
								RecentGlobalXmin);
	}

	/* Obtain share-lock on the buffer so we can examine visibility */
	LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
	got_heap_tuple = heap_hot_search_buffer(tid, scan->heapRelation,
											scan->xs_cbuf,
											scan->xs_snapshot,
											&scan->xs_ctup,
											&all_dead,
											!scan->xs_continue_hot);
	LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

	if (got_heap_tuple)
	{
		/*
		 * Only in a non-MVCC snapshot can more than one member of the
		 * HOT chain be visible.
		 */
		scan->xs_continue_hot = !IsMVCCSnapshot(scan->xs_snapshot);
		pgstat_count_heap_fetch(scan->indexRelation);
		return &scan->xs_ctup;
	}

	/* We've reached the end of the HOT chain. */
	scan->xs_continue_hot = false;

	/*
	 * If we scanned a whole HOT chain and found only dead tuples, tell index
	 * AM to kill its entry for that TID (this will take effect in the next
	 * amgettuple call, in index_getnext_tid).  We do not do this when in
	 * recovery because it may violate MVCC to do so.  See comments in
	 * RelationGetIndexScan().
	 */
	if (!scan->xactStartedInRecovery)
		scan->kill_prior_tuple = all_dead;

	return NULL;
}

/* ----------------
 *		index_getnext - get the next heap tuple from a scan
 *
 * The result is the next heap tuple satisfying the scan keys and the
 * snapshot, or NULL if no more matching tuples exist.
 *
 * On success, the buffer containing the heap tup is pinned (the pin will be
 * dropped in a future index_getnext_tid, index_fetch_heap or index_endscan
 * call).
 *
 * Note: caller must check scan->xs_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
 */
HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
	HeapTuple	heapTuple;
	ItemPointer tid;

	for (;;)
	{
		if (scan->xs_continue_hot)
		{
			/*
			 * We are resuming scan of a HOT chain after having returned an
			 * earlier member.  Must still hold pin on current heap page.
			 */
			Assert(BufferIsValid(scan->xs_cbuf));
			Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) ==
				   BufferGetBlockNumber(scan->xs_cbuf));
		}
		else
		{
			/* Time to fetch the next TID from the index */
			tid = index_getnext_tid(scan, direction);

			/* If we're out of index entries, we're done */
			if (tid == NULL)
				break;
		}

		/*
		 * Fetch the next (or only) visible heap tuple for this index entry.
		 * If we don't find anything, loop around and grab the next TID from
		 * the index.
		 */
		heapTuple = index_fetch_heap(scan);
		if (heapTuple != NULL)
			return heapTuple;
	}

	return NULL;				/* failure exit */
}

/* ----------------
 *		index_getbitmap - get all tuples at once from an index scan
 *
 * Adds the TIDs of all heap tuples satisfying the scan keys to a bitmap.
 * Since there's no interlock between the index scan and the eventual heap
 * access, this is only safe to use with MVCC-based snapshots: the heap
 * item slot could have been replaced by a newer tuple by the time we get
 * to it.
 *
 * Returns the number of matching tuples found.  (Note: this might be only
 * approximate, so it should only be used for statistical purposes.)
 * ----------------
 */
int64
index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
{
	FmgrInfo   *procedure;
	int64		ntids;
	Datum		d;

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amgetbitmap);

	/* just make sure this is false... */
	scan->kill_prior_tuple = false;

	/*
	 * have the am's getbitmap proc do all the work.
	 */
	d = FunctionCall2(procedure,
					  PointerGetDatum(scan),
					  PointerGetDatum(bitmap));

	ntids = DatumGetInt64(d);

	/* If int8 is pass-by-ref, must free the result to avoid memory leak */
#ifndef USE_FLOAT8_BYVAL
	pfree(DatumGetPointer(d));
#endif

	pgstat_count_index_tuples(scan->indexRelation, ntids);

	return ntids;
}

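/*
 * Illustrative sketch (not part of the original file): an amgetbitmap-style
 * caller, such as a bitmap index scan node, uses the routine above roughly
 * like this.  The work_mem-derived bitmap size and the iteration details
 * are assumptions about the caller, not something index_getbitmap itself
 * prescribes.
 *
 *		TIDBitmap  *bitmap = tbm_create(work_mem * 1024L);
 *		IndexScanDesc scan;
 *
 *		scan = index_beginscan_bitmap(indexRel, GetTransactionSnapshot(),
 *									  nkeys);
 *		index_rescan(scan, skeys, nkeys, NULL, 0);
 *		(void) index_getbitmap(scan, bitmap);
 *		index_endscan(scan);
 *
 * The bitmap is then drained with tbm_begin_iterate()/tbm_iterate(), and
 * each returned heap page is visited and rechecked as needed.
 */
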
/* ----------------
 *		index_bulk_delete - do mass deletion of index entries
 *
 *		callback routine tells whether a given main-heap tuple is
 *		to be deleted
 *
 *		return value is an optional palloc'd struct of statistics
 * ----------------
 */
IndexBulkDeleteResult *
index_bulk_delete(IndexVacuumInfo *info,
				  IndexBulkDeleteResult *stats,
				  IndexBulkDeleteCallback callback,
				  void *callback_state)
{
	Relation	indexRelation = info->index;
	FmgrInfo   *procedure;
	IndexBulkDeleteResult *result;

	RELATION_CHECKS;
	GET_REL_PROCEDURE(ambulkdelete);

	result = (IndexBulkDeleteResult *)
		DatumGetPointer(FunctionCall4(procedure,
									  PointerGetDatum(info),
									  PointerGetDatum(stats),
									  PointerGetDatum((Pointer) callback),
									  PointerGetDatum(callback_state)));

	return result;
}

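/*
 * Illustrative sketch (not part of the original file): the callback passed
 * to index_bulk_delete has the IndexBulkDeleteCallback signature and simply
 * reports whether the given heap TID is on the caller's kill list.  VACUUM's
 * real callback lives in the vacuum code; the array-based lookup below is a
 * hedged stand-in (DeadTidList is a hypothetical struct) meant only to show
 * the shape of the contract.
 *
 *		static bool
 *		tid_is_reaped(ItemPointer itemptr, void *state)
 *		{
 *			DeadTidList *dead = (DeadTidList *) state;
 *			int			i;
 *
 *			for (i = 0; i < dead->num_tids; i++)
 *				if (ItemPointerEquals(itemptr, &dead->tids[i]))
 *					return true;	... index entry should be deleted ...
 *			return false;			... keep the index entry ...
 *		}
 *
 *		stats = index_bulk_delete(info, NULL, tid_is_reaped, dead);
 */
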
/* ----------------
 *		index_vacuum_cleanup - do post-deletion cleanup of an index
 *
 *		return value is an optional palloc'd struct of statistics
 * ----------------
 */
IndexBulkDeleteResult *
index_vacuum_cleanup(IndexVacuumInfo *info,
					 IndexBulkDeleteResult *stats)
{
	Relation	indexRelation = info->index;
	FmgrInfo   *procedure;
	IndexBulkDeleteResult *result;

	RELATION_CHECKS;
	GET_REL_PROCEDURE(amvacuumcleanup);

	result = (IndexBulkDeleteResult *)
		DatumGetPointer(FunctionCall2(procedure,
									  PointerGetDatum(info),
									  PointerGetDatum(stats)));

	return result;
}

/* ----------------
 *		index_getprocid
 *
 *		Index access methods typically require support routines that are
 *		not directly the implementation of any WHERE-clause query operator
 *		and so cannot be kept in pg_amop.  Instead, such routines are kept
 *		in pg_amproc.  These registered procedure OIDs are assigned numbers
 *		according to a convention established by the access method.
 *		The general index code doesn't know anything about the routines
 *		involved; it just builds an ordered list of them for
 *		each attribute on which an index is defined.
 *
 *		As of Postgres 8.3, support routines within an operator family
 *		are further subdivided by the "left type" and "right type" of the
 *		query operator(s) that they support.  The "default" functions for a
 *		particular indexed attribute are those with both types equal to
 *		the index opclass' opcintype (note that this is subtly different
 *		from the indexed attribute's own type: it may be a binary-compatible
 *		type instead).  Only the default functions are stored in relcache
 *		entries --- access methods can use the syscache to look up non-default
 *		functions.
 *
 *		This routine returns the requested default procedure OID for a
 *		particular indexed attribute.
 * ----------------
 */
RegProcedure
index_getprocid(Relation irel,
				AttrNumber attnum,
				uint16 procnum)
{
	RegProcedure *loc;
	int			nproc;
	int			procindex;

	nproc = irel->rd_am->amsupport;

	Assert(procnum > 0 && procnum <= (uint16) nproc);

	procindex = (nproc * (attnum - 1)) + (procnum - 1);

	loc = irel->rd_support;

	Assert(loc != NULL);

	return loc[procindex];
}

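/*
 * Illustrative note (not part of the original file): rd_support is laid out
 * as a flat array of amsupport entries per indexed attribute, which is what
 * the procindex arithmetic above encodes.  For example, assuming an AM with
 * amsupport = 2, an index on three columns stores its support procs as
 *
 *		loc[0], loc[1]		attribute 1, proc numbers 1 and 2
 *		loc[2], loc[3]		attribute 2, proc numbers 1 and 2
 *		loc[4], loc[5]		attribute 3, proc numbers 1 and 2
 *
 * so index_getprocid(irel, 3, 2) computes procindex = 2 * (3 - 1) + (2 - 1)
 * = 5 and returns loc[5].  index_getprocinfo below indexes rd_supportinfo
 * with the same formula.
 */
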
/* ----------------
 *		index_getprocinfo
 *
 *		This routine allows index AMs to keep fmgr lookup info for
 *		support procs in the relcache.  As above, only the "default"
 *		functions for any particular indexed attribute are cached.
 *
 * Note: the return value points into cached data that will be lost during
 * any relcache rebuild!  Therefore, either use the callinfo right away,
 * or save it only after having acquired some type of lock on the index rel.
 * ----------------
 */
FmgrInfo *
index_getprocinfo(Relation irel,
				  AttrNumber attnum,
				  uint16 procnum)
{
	FmgrInfo   *locinfo;
	int			nproc;
	int			procindex;

	nproc = irel->rd_am->amsupport;

	Assert(procnum > 0 && procnum <= (uint16) nproc);

	procindex = (nproc * (attnum - 1)) + (procnum - 1);

	locinfo = irel->rd_supportinfo;

	Assert(locinfo != NULL);

	locinfo += procindex;

	/* Initialize the lookup info if first time through */
	if (locinfo->fn_oid == InvalidOid)
	{
		RegProcedure *loc = irel->rd_support;
		RegProcedure procId;

		Assert(loc != NULL);

		procId = loc[procindex];

		/*
		 * Complain if function was not found during IndexSupportInitialize.
		 * This should not happen unless the system tables contain bogus
		 * entries for the index opclass.  (If an AM wants to allow a support
		 * function to be optional, it can use index_getprocid.)
		 */
		if (!RegProcedureIsValid(procId))
			elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
				 procnum, attnum, RelationGetRelationName(irel));

		fmgr_info_cxt(procId, locinfo, irel->rd_indexcxt);
	}

	return locinfo;
}