/*-------------------------------------------------------------------------
*
* genam.h
* POSTGRES generalized index access method definitions.
*
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/genam.h
*
*-------------------------------------------------------------------------
*/
#ifndef GENAM_H
#define GENAM_H
#include "access/sdir.h"
#include "access/skey.h"
#include "nodes/tidbitmap.h"
#include "storage/lockdefs.h"
#include "utils/relcache.h"
#include "utils/snapshot.h"

/* We don't want this file to depend on execnodes.h. */
struct IndexInfo;

/*
* Struct for statistics returned by ambuild
*/
typedef struct IndexBuildResult
{
    double      heap_tuples;    /* # of tuples seen in parent table */
    double      index_tuples;   /* # of tuples inserted into index */
} IndexBuildResult;

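/*
 * Filling-in sketch (hypothetical AM's ambuild; "reltuples" and "indtuples"
 * are assumed tuple counts gathered during the AM's build scan, e.g. by a
 * table_index_build_scan callback):
 *
 *		IndexBuildResult *result;
 *
 *		result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
 *		result->heap_tuples = reltuples;
 *		result->index_tuples = indtuples;
 *		return result;
 */
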
/*
* Struct for input arguments passed to ambulkdelete and amvacuumcleanup
*
* num_heap_tuples is accurate only when estimated_count is false;
* otherwise it's just an estimate (currently, the estimate is the
* prior value of the relation's pg_class.reltuples field, so it could
* even be -1). It will always just be an estimate during ambulkdelete.
*/
typedef struct IndexVacuumInfo
{
    Relation    index;              /* the index being vacuumed */
    Relation    heaprel;            /* the heap relation the index belongs to */
    bool        analyze_only;       /* ANALYZE (without any actual vacuum) */
    bool        report_progress;    /* emit progress.h status reports */
    bool        estimated_count;    /* num_heap_tuples is an estimate */
    int         message_level;      /* ereport level for progress messages */
    double      num_heap_tuples;    /* tuples remaining in heap */
    BufferAccessStrategy strategy;  /* access strategy for reads */
} IndexVacuumInfo;

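/*
 * Setup sketch, mirroring what VACUUM's caller side (vacuumlazy.c) does;
 * the field values here are illustrative only:
 *
 *		IndexVacuumInfo ivinfo;
 *
 *		ivinfo.index = indrel;
 *		ivinfo.heaprel = heaprel;
 *		ivinfo.analyze_only = false;
 *		ivinfo.report_progress = true;
 *		ivinfo.estimated_count = true;
 *		ivinfo.message_level = DEBUG2;
 *		ivinfo.num_heap_tuples = reltuples;
 *		ivinfo.strategy = bstrategy;
 */
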
/*
* Struct for statistics returned by ambulkdelete and amvacuumcleanup
*
* This struct is normally allocated by the first ambulkdelete call and then
* passed along through subsequent ones until amvacuumcleanup; however,
* amvacuumcleanup must be prepared to allocate it in the case where no
* ambulkdelete calls were made (because no tuples needed deletion).
* Note that an index AM could choose to return a larger struct
* of which this is just the first field; this provides a way for ambulkdelete
* to communicate additional private data to amvacuumcleanup.
*
* Note: pages_newly_deleted is the number of pages in the index that were
* deleted by the current vacuum operation. pages_deleted and pages_free
* refer to free space within the index file.
*
* Note: Some index AMs may compute num_index_tuples by reference to
* num_heap_tuples, in which case they should copy the estimated_count field
* from IndexVacuumInfo.
*/
typedef struct IndexBulkDeleteResult
{
    BlockNumber num_pages;          /* pages remaining in index */
    bool        estimated_count;    /* num_index_tuples is an estimate */
    double      num_index_tuples;   /* tuples remaining */
    double      tuples_removed;     /* # removed during vacuum operation */
    BlockNumber pages_newly_deleted;    /* # pages marked deleted by us */
    BlockNumber pages_deleted;      /* # pages marked deleted (could be by us) */
    BlockNumber pages_free;         /* # pages available for reuse */
} IndexBulkDeleteResult;

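/*
 * Allocation-protocol sketch (hypothetical amvacuumcleanup): the routine
 * must cope with a NULL stats pointer, and the final two assignments show
 * the estimated_count copying rule mentioned above, for an AM that derives
 * num_index_tuples from num_heap_tuples:
 *
 *		IndexBulkDeleteResult *
 *		myvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 *		{
 *			if (info->analyze_only)
 *				return stats;
 *			if (stats == NULL)
 *				stats = (IndexBulkDeleteResult *)
 *					palloc0(sizeof(IndexBulkDeleteResult));
 *			stats->num_index_tuples = info->num_heap_tuples;
 *			stats->estimated_count = info->estimated_count;
 *			return stats;
 *		}
 */
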
/* Typedef for callback function to determine if a tuple is bulk-deletable */
typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state);
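
/*
 * Callback sketch (hypothetical; VACUUM's real callback lives in
 * vacuumlazy.c).  "state" is whatever pointer the caller handed to
 * index_bulk_delete(); DeadTidTable and dead_tid_lookup are assumed here:
 *
 *		static bool
 *		my_tid_reaped(ItemPointer itemptr, void *state)
 *		{
 *			DeadTidTable *dead = (DeadTidTable *) state;
 *
 *			return dead_tid_lookup(dead, itemptr);
 *		}
 */
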
/* struct definitions appear in relscan.h */
typedef struct IndexScanDescData *IndexScanDesc;
typedef struct SysScanDescData *SysScanDesc;
typedef struct ParallelIndexScanDescData *ParallelIndexScanDesc;
/*
* Enumeration specifying the type of uniqueness check to perform in
* index_insert().
*
* UNIQUE_CHECK_YES is the traditional Postgres immediate check, possibly
* blocking to see if a conflicting transaction commits.
*
* For deferrable unique constraints, UNIQUE_CHECK_PARTIAL is specified at
* insertion time. The index AM should test if the tuple is unique, but
* should not throw error, block, or prevent the insertion if the tuple
* appears not to be unique. We'll recheck later when it is time for the
* constraint to be enforced. The AM must return true if the tuple is
* known unique, false if it is possibly non-unique. In the "true" case
* it is safe to omit the later recheck.
*
* When it is time to recheck the deferred constraint, a pseudo-insertion
* call is made with UNIQUE_CHECK_EXISTING. The tuple is already in the
* index in this case, so it should not be inserted again. Rather, just
* check for conflicting live tuples (possibly blocking).
*/
typedef enum IndexUniqueCheck
{
    UNIQUE_CHECK_NO,            /* Don't do any uniqueness checking */
    UNIQUE_CHECK_YES,           /* Enforce uniqueness at insertion time */
    UNIQUE_CHECK_PARTIAL,       /* Test uniqueness, but no error */
    UNIQUE_CHECK_EXISTING,      /* Check if existing tuple is unique */
} IndexUniqueCheck;

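/*
 * Dispatch sketch (hypothetical AM; nbtree's real logic lives in
 * _bt_doinsert() and _bt_check_unique()).  check_unique_in_index is an
 * assumed helper:
 *
 *		is_unique = check_unique_in_index(...);
 *
 *		if (!is_unique && checkUnique == UNIQUE_CHECK_YES)
 *			ereport(ERROR,
 *					(errcode(ERRCODE_UNIQUE_VIOLATION),
 *					 errmsg("duplicate key value")));
 *		if (checkUnique == UNIQUE_CHECK_PARTIAL)
 *			return is_unique;	(a "false" result is rechecked later)
 */
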
/* Nullable "ORDER BY col op const" distance */
typedef struct IndexOrderByDistance
{
    double      value;
    bool        isnull;
} IndexOrderByDistance;

/*
* generalized index_ interface routines (in indexam.c)
*/
/*
* IndexScanIsValid
* True iff the index scan is valid.
*/
#define IndexScanIsValid(scan) PointerIsValid(scan)
extern Relation index_open(Oid relationId, LOCKMODE lockmode);
extern Relation try_index_open(Oid relationId, LOCKMODE lockmode);
extern void index_close(Relation relation, LOCKMODE lockmode);
extern bool index_insert(Relation indexRelation,
                         Datum *values, bool *isnull,
                         ItemPointer heap_t_ctid,
                         Relation heapRelation,
                         IndexUniqueCheck checkUnique,
                         bool indexUnchanged,
                         struct IndexInfo *indexInfo);
extern void index_insert_cleanup(Relation indexRelation,
                                 struct IndexInfo *indexInfo);
extern IndexScanDesc index_beginscan(Relation heapRelation,
                                     Relation indexRelation,
                                     Snapshot snapshot,
                                     int nkeys, int norderbys);
extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation,
                                            Snapshot snapshot,
                                            int nkeys);
extern void index_rescan(IndexScanDesc scan,
                         ScanKey keys, int nkeys,
                         ScanKey orderbys, int norderbys);
extern void index_endscan(IndexScanDesc scan);
extern void index_markpos(IndexScanDesc scan);
extern void index_restrpos(IndexScanDesc scan);
extern Size index_parallelscan_estimate(Relation indexRelation,
                                        int nkeys, int norderbys, Snapshot snapshot);
extern void index_parallelscan_initialize(Relation heapRelation,
                                          Relation indexRelation, Snapshot snapshot,
                                          ParallelIndexScanDesc target);
extern void index_parallelrescan(IndexScanDesc scan);
extern IndexScanDesc index_beginscan_parallel(Relation heaprel,
                                              Relation indexrel, int nkeys, int norderbys,
                                              ParallelIndexScanDesc pscan);
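
/*
 * Parallel-scan usage sketch (leader side; allocating the shared area with
 * shm_toc_allocate() is assumed context, DSM details elided):
 *
 *		Size		sz;
 *		ParallelIndexScanDesc pscan;
 *
 *		sz = index_parallelscan_estimate(indexRel, nkeys, norderbys, snapshot);
 *		pscan = (ParallelIndexScanDesc) shm_toc_allocate(toc, sz);
 *		index_parallelscan_initialize(heapRel, indexRel, snapshot, pscan);
 *
 * Each participant (leader and workers) then attaches with
 * index_beginscan_parallel(heapRel, indexRel, nkeys, norderbys, pscan).
 */
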
extern ItemPointer index_getnext_tid(IndexScanDesc scan,
                                     ScanDirection direction);
struct TupleTableSlot;
extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot);
extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
                               struct TupleTableSlot *slot);
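
/*
 * Ordinary scan-loop sketch ("heapRel", "indexRel", "snapshot", and "slot"
 * are assumed to be set up already; the key matches the index's first
 * column against the constant 42):
 *
 *		IndexScanDesc scan;
 *		ScanKeyData skey;
 *
 *		ScanKeyInit(&skey, 1, BTEqualStrategyNumber,
 *					F_INT4EQ, Int32GetDatum(42));
 *		scan = index_beginscan(heapRel, indexRel, snapshot, 1, 0);
 *		index_rescan(scan, &skey, 1, NULL, 0);
 *		while (index_getnext_slot(scan, ForwardScanDirection, slot))
 *			;	(process the tuple stored in slot)
 *		index_endscan(scan);
 */
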
extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap);
extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
                                                IndexBulkDeleteResult *istat,
                                                IndexBulkDeleteCallback callback,
                                                void *callback_state);
extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
                                                    IndexBulkDeleteResult *istat);
extern bool index_can_return(Relation indexRelation, int attno);
extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
                                    uint16 procnum);
extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
                                   uint16 procnum);
extern void index_store_float8_orderby_distances(IndexScanDesc scan,
                                                 Oid *orderByTypes,
                                                 IndexOrderByDistance *distances,
                                                 bool recheckOrderBy);
extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum,
                                    Datum attoptions, bool validate);
/*
* index access method support routines (in genam.c)
*/
extern IndexScanDesc RelationGetIndexScan(Relation indexRelation,
                                          int nkeys, int norderbys);
extern void IndexScanEnd(IndexScanDesc scan);
extern char *BuildIndexValueDescription(Relation indexRelation,
                                        const Datum *values, const bool *isnull);
extern TransactionId index_compute_xid_horizon_for_tuples(Relation irel,
                                                          Relation hrel,
                                                          Buffer ibuf,
                                                          OffsetNumber *itemnos,
                                                          int nitems);
/*
* heap-or-index access to system catalogs (in genam.c)
*/
extern SysScanDesc systable_beginscan(Relation heapRelation,
                                      Oid indexId,
                                      bool indexOK,
                                      Snapshot snapshot,
                                      int nkeys, ScanKey key);
extern HeapTuple systable_getnext(SysScanDesc sysscan);
extern bool systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup);
extern void systable_endscan(SysScanDesc sysscan);
extern SysScanDesc systable_beginscan_ordered(Relation heapRelation,
                                              Relation indexRelation,
                                              Snapshot snapshot,
                                              int nkeys, ScanKey key);
extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan,
                                          ScanDirection direction);
extern void systable_endscan_ordered(SysScanDesc sysscan);
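
/*
 * Catalog-scan usage sketch (fetching a pg_class row by OID; "rel" is
 * assumed to be pg_class opened with a suitable lock, "relid" the target
 * OID).  Passing a NULL snapshot makes systable_beginscan use a recent
 * catalog snapshot:
 *
 *		ScanKeyData skey;
 *		SysScanDesc scan;
 *		HeapTuple	tup;
 *
 *		ScanKeyInit(&skey, Anum_pg_class_oid, BTEqualStrategyNumber,
 *					F_OIDEQ, ObjectIdGetDatum(relid));
 *		scan = systable_beginscan(rel, ClassOidIndexId, true, NULL, 1, &skey);
 *		while (HeapTupleIsValid(tup = systable_getnext(scan)))
 *			;	(examine tup)
 *		systable_endscan(scan);
 */
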
#endif /* GENAM_H */