/*-------------------------------------------------------------------------
 *
 * rel.h
 *	  POSTGRES relation descriptor (a/k/a relcache entry) definitions.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/utils/rel.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef REL_H
#define REL_H

#include "access/tupdesc.h"
#include "access/xlog.h"
#include "catalog/pg_class.h"
#include "catalog/pg_index.h"
#include "catalog/pg_publication.h"
#include "nodes/bitmapset.h"
#include "partitioning/partdefs.h"
#include "rewrite/prs2lock.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "utils/relcache.h"
#include "utils/reltrigger.h"


/*
 * LockRelId and LockInfo really belong to lmgr.h, but it's more convenient
 * to declare them here so we can have a LockInfoData field in a Relation.
 */

typedef struct LockRelId
{
	Oid			relId;			/* a relation identifier */
	Oid			dbId;			/* a database identifier */
} LockRelId;

typedef struct LockInfoData
{
	LockRelId	lockRelId;
} LockInfoData;

typedef LockInfoData *LockInfo;
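
/*
 * Illustrative sketch (not part of the original header): the lock manager
 * identifies a relation lock by database OID plus relation OID, which is
 * exactly what a LockRelId carries; compare SET_LOCKTAG_RELATION() in
 * storage/lock.h:
 *
 *		LockRelId	lockid = rel->rd_lockInfo.lockRelId;
 *		LOCKTAG		tag;
 *
 *		SET_LOCKTAG_RELATION(tag, lockid.dbId, lockid.relId);
 */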

/*
 * Here are the contents of a relation cache entry.
 */

typedef struct RelationData
{
	RelFileNode rd_node;		/* relation physical identifier */
	SMgrRelation rd_smgr;		/* cached file handle, or NULL */
	int			rd_refcnt;		/* reference count */
	BackendId	rd_backend;		/* owning backend id, if temporary relation */
	bool		rd_islocaltemp; /* rel is a temp rel of this session */
	bool		rd_isnailed;	/* rel is nailed in cache */
	bool		rd_isvalid;		/* relcache entry is valid */
	bool		rd_indexvalid;	/* is rd_indexlist valid? (also rd_pkindex and
								 * rd_replidindex) */
	bool		rd_statvalid;	/* is rd_statlist valid? */

	/*----------
	 * rd_createSubid is the ID of the highest subtransaction the rel has
	 * survived into or zero if the rel or its rd_node was created before the
	 * current top transaction.  (IndexStmt.oldNode leads to the case of a new
	 * rel with an old rd_node.)  rd_firstRelfilenodeSubid is the ID of the
	 * highest subtransaction an rd_node change has survived into or zero if
	 * rd_node matches the value it had at the start of the current top
	 * transaction.  (Rolling back the subtransaction that
	 * rd_firstRelfilenodeSubid denotes would restore rd_node to the value it
	 * had at the start of the current top transaction.  Rolling back any
	 * lower subtransaction would not.)  Their accuracy is critical to
	 * RelationNeedsWAL().
	 *
	 * rd_newRelfilenodeSubid is the ID of the highest subtransaction the
	 * most-recent relfilenode change has survived into or zero if not changed
	 * in the current transaction (or we have forgotten changing it).  This
	 * field is accurate when non-zero, but it can be zero when a relation has
	 * multiple new relfilenodes within a single transaction, with one of them
	 * occurring in a subsequently aborted subtransaction, e.g.
	 *		BEGIN;
	 *		TRUNCATE t;
	 *		SAVEPOINT save;
	 *		TRUNCATE t;
	 *		ROLLBACK TO save;
	 *		-- rd_newRelfilenodeSubid is now forgotten
	 *
	 * If every rd_*Subid field is zero, they are read-only outside
	 * relcache.c.  Files that trigger rd_node changes by updating
	 * pg_class.reltablespace and/or pg_class.relfilenode call
	 * RelationAssumeNewRelfilenode() to update rd_*Subid.
	 *
	 * rd_droppedSubid is the ID of the highest subtransaction that a drop of
	 * the rel has survived into.  In entries visible outside relcache.c, this
	 * is always zero.
	 */
	SubTransactionId rd_createSubid;	/* rel was created in current xact */
	SubTransactionId rd_newRelfilenodeSubid;	/* highest subxact changing
												 * rd_node to current value */
	SubTransactionId rd_firstRelfilenodeSubid;	/* highest subxact changing
												 * rd_node to any value */
	SubTransactionId rd_droppedSubid;	/* dropped with another Subid set */
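
	/*
	 * Illustrative sketch (an assumption, reconstructed from the rules
	 * above; the real RelationNeedsWAL() macro lives further down in this
	 * header): the WAL-skipping decision consults these fields roughly like
	 *
	 *		RelationIsPermanent(rel) &&
	 *			(XLogIsNeeded() ||
	 *			 (rel->rd_createSubid == InvalidSubTransactionId &&
	 *			  rel->rd_firstRelfilenodeSubid == InvalidSubTransactionId))
	 *
	 * which is why the accuracy rules spelled out above matter.
	 */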

	Form_pg_class rd_rel;		/* RELATION tuple */
	TupleDesc	rd_att;			/* tuple descriptor */
	Oid			rd_id;			/* relation's object id */
	LockInfoData rd_lockInfo;	/* lock mgr's info for locking relation */
	RuleLock   *rd_rules;		/* rewrite rules */
	MemoryContext rd_rulescxt;	/* private memory cxt for rd_rules, if any */
	TriggerDesc *trigdesc;		/* Trigger info, or NULL if rel has none */
	/* use "struct" here to avoid needing to include rowsecurity.h: */
	struct RowSecurityDesc *rd_rsdesc;	/* row security policies, or NULL */

	/* data managed by RelationGetFKeyList: */
	List	   *rd_fkeylist;	/* list of ForeignKeyCacheInfo (see below) */
	bool		rd_fkeyvalid;	/* true if list has been computed */

	/* data managed by RelationGetPartitionKey: */
	PartitionKey rd_partkey;	/* partition key, or NULL */
	MemoryContext rd_partkeycxt;	/* private context for rd_partkey, if any */

	/* data managed by RelationGetPartitionDesc: */
	PartitionDesc rd_partdesc;	/* partition descriptor, or NULL */
	MemoryContext rd_pdcxt;		/* private context for rd_partdesc, if any */

	/* Same as above, for partdescs that omit detached partitions */
	PartitionDesc rd_partdesc_nodetached;	/* partdesc w/o detached parts */
	MemoryContext rd_pddcxt;	/* for rd_partdesc_nodetached, if any */

	/*
	 * pg_inherits.xmin of the partition that was excluded in
	 * rd_partdesc_nodetached.  This informs a future user of that partdesc:
	 * if this value is not in progress for the active snapshot, then the
	 * partdesc can be used; otherwise they have to build a new one.  (This
	 * matches what find_inheritance_children_extended would do.)
	 */
	TransactionId rd_partdesc_nodetached_xmin;
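
	/*
	 * Illustrative note (preserved from the commit history annotating this
	 * file; the accessors live outside this header): the partitioning data
	 * above is populated only on first request, so callers must always go
	 * through RelationGetPartitionKey() and RelationGetPartitionDesc()
	 * rather than fetching rd_partkey or rd_partdesc directly, since either
	 * field may still be NULL in a freshly (re)built relcache entry.
	 */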

	/* data managed by RelationGetPartitionQual: */
	List	   *rd_partcheck;	/* partition CHECK quals */
	bool		rd_partcheckvalid;	/* true if list has been computed */
	MemoryContext rd_partcheckcxt;	/* private cxt for rd_partcheck, if any */

	/* data managed by RelationGetIndexList: */
	List	   *rd_indexlist;	/* list of OIDs of indexes on relation */
	Oid			rd_pkindex;		/* OID of primary key, if any */
	Oid			rd_replidindex; /* OID of replica identity index, if any */

	/* data managed by RelationGetStatExtList: */
	List	   *rd_statlist;	/* list of OIDs of extended stats */

	/* data managed by RelationGetIndexAttrBitmap: */
	bool		rd_attrsvalid;	/* are bitmaps of attrs valid? */
	Bitmapset  *rd_keyattr;		/* cols that can be ref'd by foreign keys */
	Bitmapset  *rd_pkattr;		/* cols included in primary key */
	Bitmapset  *rd_idattr;		/* included in replica identity index */
	Bitmapset  *rd_hotblockingattr; /* cols blocking HOT update */
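
	/*
	 * Illustrative sketch (an assumption, based on relcache.c conventions):
	 * the bitmaps above are fetched via RelationGetIndexAttrBitmap()
	 * (utils/relcache.h), and their members are attribute numbers offset by
	 * FirstLowInvalidHeapAttributeNumber so that system columns fit too:
	 *
	 *		Bitmapset  *key = RelationGetIndexAttrBitmap(rel,
	 *													 INDEX_ATTR_BITMAP_KEY);
	 *
	 *		if (bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber, key))
	 *			... column can be referenced by a foreign key ...
	 */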

	PublicationActions *rd_pubactions;	/* publication actions */

	/*
	 * rd_options is set whenever rd_rel is loaded into the relcache entry.
	 * Note that you can NOT look into rd_rel for this data.  NULL means "use
	 * defaults".
	 */
	bytea	   *rd_options;		/* parsed pg_class.reloptions */

	/*
	 * Oid of the handler for this relation.  For an index this is a function
	 * returning IndexAmRoutine; for table-like relations, a function
	 * returning TableAmRoutine.  This is stored separately from rd_indam and
	 * rd_tableam as its lookup requires syscache access, but during relcache
	 * bootstrap we need to be able to initialize rd_tableam without syscache
	 * lookups.
	 */
	Oid			rd_amhandler;	/* OID of handler function for this AM */

	/*
	 * Table access method.
	 */
	const struct TableAmRoutine *rd_tableam;
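
	/*
	 * Illustrative sketch (an assumption; the real wrappers live in
	 * access/tableam.h): table AM calls are dispatched through this struct,
	 * e.g. the table_tuple_insert() wrapper expands to roughly
	 *
	 *		rel->rd_tableam->tuple_insert(rel, slot, cid, options, bistate);
	 */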

	/* These are non-NULL only for an index relation: */
	Form_pg_index rd_index;		/* pg_index tuple describing this index */
	/* use "struct" here to avoid needing to include htup.h: */
	struct HeapTupleData *rd_indextuple;	/* all of pg_index tuple */

	/*
	 * index access support info (used only for an index relation)
	 *
	 * Note: only default support procs for each opclass are cached, namely
	 * those with lefttype and righttype equal to the opclass's opcintype.  The
	 * arrays are indexed by support function number, which is a sufficient
	 * identifier given that restriction.
	 */
	MemoryContext rd_indexcxt;	/* private memory cxt for this stuff */
	/* use "struct" here to avoid needing to include amapi.h: */
	struct IndexAmRoutine *rd_indam;	/* index AM's API struct */
	Oid		   *rd_opfamily;	/* OIDs of op families for each index col */
	Oid		   *rd_opcintype;	/* OIDs of opclass declared input data types */
	RegProcedure *rd_support;	/* OIDs of support procedures */
	struct FmgrInfo *rd_supportinfo;	/* lookup info for support procedures */
	int16	   *rd_indoption;	/* per-column AM-specific flags */
	List	   *rd_indexprs;	/* index expression trees, if any */
	List	   *rd_indpred;		/* index predicate tree, if any */
	Oid		   *rd_exclops;		/* OIDs of exclusion operators, if any */
	Oid		   *rd_exclprocs;	/* OIDs of exclusion ops' procs, if any */
	uint16	   *rd_exclstrats;	/* exclusion ops' strategy numbers, if any */
	Oid		   *rd_indcollation;	/* OIDs of index collations */
	bytea	  **rd_opcoptions;	/* parsed opclass-specific options */
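
	/*
	 * Illustrative sketch (an assumption; see index_getprocid() in
	 * access/genam.h for the real accessor): the support-proc arrays hold
	 * amsupport entries per key column, so the procedure for 1-based column
	 * attno and support number procnum sits at roughly
	 *
	 *		rd_support[(attno - 1) * amsupport + (procnum - 1)]
	 */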

	/*
	 * rd_amcache is available for index and table AMs to cache private data
	 * about the relation.  This must be just a cache since it may get reset
	 * at any time (in particular, it will get reset by a relcache inval
	 * message for the relation).  If used, it must point to a single memory
	 * chunk palloc'd in CacheMemoryContext, or in rd_indexcxt for an index
	 * relation.  A relcache reset will include freeing that chunk and setting
	 * rd_amcache = NULL.
	 */
	void	   *rd_amcache;		/* available for use by index/table AM */
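
	/*
	 * Illustrative sketch (an assumption, following the rule above;
	 * MyAmCacheData is a placeholder name): an index AM might lazily cache a
	 * private struct here and let relcache invalidation discard it:
	 *
	 *		if (rel->rd_amcache == NULL)
	 *			rel->rd_amcache = MemoryContextAllocZero(rel->rd_indexcxt,
	 *													 sizeof(MyAmCacheData));
	 */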

	/*
	 * foreign-table support
	 *
	 * rd_fdwroutine must point to a single memory chunk palloc'd in
	 * CacheMemoryContext.  It will be freed and reset to NULL on a relcache
	 * reset.
	 */

	/* use "struct" here to avoid needing to include fdwapi.h: */
	struct FdwRoutine *rd_fdwroutine;	/* cached function pointers, or NULL */

	/*
	 * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new
	 * version of a table, we need to make any toast pointers inserted into it
	 * have the existing toast table's OID, not the OID of the transient toast
	 * table.  If rd_toastoid isn't InvalidOid, it is the OID to place in
	 * toast pointers inserted into this rel.  (Note it's set on the new
	 * version of the main heap, not the toast table itself.)  This also
	 * causes toast_save_datum() to try to preserve toast value OIDs.
	 */
	Oid			rd_toastoid;	/* Real TOAST table's OID, or InvalidOid */

	/* use "struct" here to avoid needing to include pgstat.h: */
	struct PgStat_TableStatus *pgstat_info; /* statistics collection area */
} RelationData;
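
/*
 * Illustrative usage sketch (not part of the original header): a Relation is
 * normally obtained from and returned to the relcache through the
 * table_open()/table_close() wrappers in access/table.h, which manage
 * locking and rd_refcnt:
 *
 *		Relation	rel = table_open(relid, AccessShareLock);
 *		TupleDesc	tupdesc = RelationGetDescr(rel);	(i.e., rel->rd_att)
 *
 *		... use tupdesc ...
 *
 *		table_close(rel, AccessShareLock);
 */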


/*
 * ForeignKeyCacheInfo
 *		Information the relcache can cache about foreign key constraints
 *
 * This is basically just an image of relevant columns from pg_constraint.
 * We make it a subclass of Node so that copyObject() can be used on a list
 * of these, but we also ensure it is a "flat" object without substructure,
 * so that list_free_deep() is sufficient to free such a list.
 * The per-FK-column arrays can be fixed-size because we allow at most
 * INDEX_MAX_KEYS columns in a foreign key constraint.
 *
 * Currently, we mostly cache fields of interest to the planner, but the set
 * of fields has already grown the constraint OID for other uses.
 */
typedef struct ForeignKeyCacheInfo
{
	NodeTag		type;
	Oid			conoid;			/* oid of the constraint itself */
	Oid			conrelid;		/* relation constrained by the foreign key */
	Oid			confrelid;		/* relation referenced by the foreign key */
	int			nkeys;			/* number of columns in the foreign key */
	/* these arrays each have nkeys valid entries: */
	AttrNumber	conkey[INDEX_MAX_KEYS]; /* cols in referencing table */
	AttrNumber	confkey[INDEX_MAX_KEYS];	/* cols in referenced table */
	Oid			conpfeqop[INDEX_MAX_KEYS];	/* PK = FK operator OIDs */
} ForeignKeyCacheInfo;
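
/*
 * Illustrative usage sketch (not part of the original header): walking a
 * relation's cached foreign-key list.  RelationGetFKeyList() is declared in
 * utils/relcache.h and returns a List of ForeignKeyCacheInfo:
 *
 *		ListCell   *lc;
 *
 *		foreach(lc, RelationGetFKeyList(rel))
 *		{
 *			ForeignKeyCacheInfo *fk = (ForeignKeyCacheInfo *) lfirst(lc);
 *
 *			... fk->conrelid, fk->confrelid, fk->nkeys ...
 *		}
 */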


/*
 * StdRdOptions
 *		Standard contents of rd_options for heaps.
 *
 * RelationGetFillFactor() and RelationGetTargetPageFreeSpace() can only
 * be applied to relations that use this format or a superset for
 * private options data.
 */

/* autovacuum-related reloptions. */
typedef struct AutoVacOpts
{
	bool		enabled;
	int			vacuum_threshold;
	int			vacuum_ins_threshold;
	int			analyze_threshold;
	int			vacuum_cost_limit;
	int			freeze_min_age;
	int			freeze_max_age;
	int			freeze_table_age;
	int			multixact_freeze_min_age;
	int			multixact_freeze_max_age;
	int			multixact_freeze_table_age;
	int			log_min_duration;
	float8		vacuum_cost_delay;
	float8		vacuum_scale_factor;
	float8		vacuum_ins_scale_factor;
	float8		analyze_scale_factor;
} AutoVacOpts;
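
/*
 * Illustrative sketch (an assumption, modeled on the autovacuum code):
 * per-relation autovacuum settings are reached through rd_options, with
 * NULL meaning "use the server-wide defaults":
 *
 *		AutoVacOpts *av = rel->rd_options != NULL ?
 *			&((StdRdOptions *) rel->rd_options)->autovacuum : NULL;
 *
 *		if (av == NULL || av->enabled)
 *			... relation is a candidate for autovacuum ...
 */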

/* StdRdOptions->vacuum_index_cleanup values */
typedef enum StdRdOptIndexCleanup
{
	STDRD_OPTION_VACUUM_INDEX_CLEANUP_AUTO = 0,
	STDRD_OPTION_VACUUM_INDEX_CLEANUP_OFF,
	STDRD_OPTION_VACUUM_INDEX_CLEANUP_ON
} StdRdOptIndexCleanup;
|
|
|
|
|
2006-07-04 00:45:41 +02:00
|
|
|
typedef struct StdRdOptions
|
|
|
|
{
|
2007-02-28 00:48:10 +01:00
|
|
|
int32 vl_len_; /* varlena header (do not touch directly!) */
|
2006-07-04 00:45:41 +02:00
|
|
|
int fillfactor; /* page fill factor in percent (0..100) */
|
Skip full index scan during cleanup of B-tree indexes when possible
An index vacuum consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When the workload on a particular table
is append-only, autovacuum isn't intended to touch the table. However, the
user may run vacuum manually in order to fill the visibility map and get the
benefit of index-only scans. Then ambulkdelete wouldn't be called for the
indexes of such a table (because no heap tuples were deleted); only
amvacuumcleanup would be called. In this case, amvacuumcleanup would perform
a full index scan for two objectives: put recyclable pages into the free
space map and update index statistics.
This patch allows btvacuumcleanup to skip the full index scan when two
conditions are satisfied: no pages are going to be put into the free space
map, and the index statistics aren't stale. To check the first condition, we
store the oldest btpo_xact in the meta-page; when it precedes
RecentGlobalXmin, there are some recyclable pages. To check the second
condition, we store the number of heap tuples observed during the previous
full index scan by cleanup. If the fraction of newly inserted tuples is less
than vacuum_cleanup_index_scale_factor, the statistics aren't considered
stale. vacuum_cleanup_index_scale_factor can be set both as a reloption and
as a GUC (the default).
This patch bumps the B-tree meta-page version. The meta-page upgrade is
performed "on the fly": during VACUUM, the meta-page is rewritten with the
new version. No special handling in pg_upgrade is required.
Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
2018-04-04 18:29:00 +02:00
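A minimal sketch of the two skip conditions, assuming the caller has already
derived the inputs; all names are hypothetical rather than the actual nbtree
code:

static bool
btree_cleanup_scan_needed(bool recyclable_pages_pending,
						  double prev_num_heap_tuples,
						  double num_heap_tuples,
						  double cleanup_scale_factor)
{
	/* Condition 1: recyclable pages must be put into the free space map. */
	if (recyclable_pages_pending)
		return true;

	/* Condition 2: statistics are stale once the insert fraction is large. */
	if (prev_num_heap_tuples <= 0)	/* no count from a previous full scan */
		return true;

	return (num_heap_tuples - prev_num_heap_tuples) / prev_num_heap_tuples >
		cleanup_scale_factor;
}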
|
|
|
/* fraction of newly inserted tuples needed to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
|
2017-11-19 23:50:10 +01:00
|
|
|
int toast_tuple_target; /* target for tuple toasting */
|
2009-02-09 21:57:59 +01:00
|
|
|
AutoVacOpts autovacuum; /* autovacuum-related options */
|
2013-12-11 01:17:34 +01:00
|
|
|
bool user_catalog_table; /* use as an additional catalog relation */
|
2016-06-09 15:08:27 +02:00
|
|
|
int parallel_workers; /* max number of parallel workers */
|
2021-06-19 05:04:07 +02:00
|
|
|
StdRdOptIndexCleanup vacuum_index_cleanup; /* controls index vacuuming */
|
2019-04-08 09:43:57 +02:00
|
|
|
bool vacuum_truncate; /* enables vacuum to truncate a relation */
|
2006-07-04 00:45:41 +02:00
|
|
|
} StdRdOptions;
|
|
|
|
|
|
|
|
#define HEAP_MIN_FILLFACTOR 10
|
|
|
|
#define HEAP_DEFAULT_FILLFACTOR 100
|
|
|
|
|
2017-11-19 23:50:10 +01:00
|
|
|
/*
|
|
|
|
* RelationGetToastTupleTarget
|
|
|
|
* Returns the relation's toast_tuple_target. Note multiple eval of argument!
|
|
|
|
*/
|
|
|
|
#define RelationGetToastTupleTarget(relation, defaulttarg) \
|
|
|
|
((relation)->rd_options ? \
|
|
|
|
((StdRdOptions *) (relation)->rd_options)->toast_tuple_target : (defaulttarg))
|
|
|
|
|
2006-07-04 00:45:41 +02:00
|
|
|
/*
|
|
|
|
* RelationGetFillFactor
|
|
|
|
* Returns the relation's fillfactor. Note multiple eval of argument!
|
|
|
|
*/
|
|
|
|
#define RelationGetFillFactor(relation, defaultff) \
|
|
|
|
((relation)->rd_options ? \
|
|
|
|
((StdRdOptions *) (relation)->rd_options)->fillfactor : (defaultff))
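/*
 * Because the macro expands "relation" twice, an argument with side effects
 * (say, RelationGetFillFactor(rel_array[i++], HEAP_DEFAULT_FILLFACTOR))
 * would apply those side effects twice; pass a side-effect-free expression.
 */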
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationGetTargetPageUsage
|
|
|
|
* Returns the relation's desired space usage per page in bytes.
|
|
|
|
*/
|
|
|
|
#define RelationGetTargetPageUsage(relation, defaultff) \
|
|
|
|
(BLCKSZ * RelationGetFillFactor(relation, defaultff) / 100)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationGetTargetPageFreeSpace
|
|
|
|
* Returns the relation's desired freespace per page in bytes.
|
|
|
|
*/
|
|
|
|
#define RelationGetTargetPageFreeSpace(relation, defaultff) \
|
|
|
|
(BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100)
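/*
 * Worked example: with the default BLCKSZ of 8192 and fillfactor = 90, this
 * yields 8192 * (100 - 90) / 100 = 819 bytes kept free on each page for
 * future updates of existing tuples.
 */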
|
|
|
|
|
2014-07-14 23:24:40 +02:00
|
|
|
/*
|
|
|
|
* RelationIsUsedAsCatalogTable
|
|
|
|
* Returns whether the relation should be treated as a catalog table
|
2016-06-09 17:16:26 +02:00
|
|
|
* from the point of view of logical decoding. Note multiple eval of argument!
|
2014-07-14 23:24:40 +02:00
|
|
|
*/
|
|
|
|
#define RelationIsUsedAsCatalogTable(relation) \
|
2016-11-10 21:00:58 +01:00
|
|
|
((relation)->rd_options && \
|
|
|
|
((relation)->rd_rel->relkind == RELKIND_RELATION || \
|
|
|
|
(relation)->rd_rel->relkind == RELKIND_MATVIEW) ? \
|
2014-07-14 23:24:40 +02:00
|
|
|
((StdRdOptions *) (relation)->rd_options)->user_catalog_table : false)
|
|
|
|
|
2016-04-08 17:14:56 +02:00
|
|
|
/*
|
2016-06-09 17:16:26 +02:00
|
|
|
* RelationGetParallelWorkers
|
|
|
|
* Returns the relation's parallel_workers reloption setting.
|
|
|
|
* Note multiple eval of argument!
|
2016-04-08 17:14:56 +02:00
|
|
|
*/
|
2016-06-09 17:16:26 +02:00
|
|
|
#define RelationGetParallelWorkers(relation, defaultpw) \
|
2016-04-08 17:14:56 +02:00
|
|
|
((relation)->rd_options ? \
|
2016-06-09 17:16:26 +02:00
|
|
|
((StdRdOptions *) (relation)->rd_options)->parallel_workers : (defaultpw))
|
2016-04-08 17:14:56 +02:00
|
|
|
|
2019-09-25 20:56:52 +02:00
|
|
|
/* ViewOptions->check_option values */
|
|
|
|
typedef enum ViewOptCheckOption
|
|
|
|
{
|
|
|
|
VIEW_OPTION_CHECK_OPTION_NOT_SET,
|
|
|
|
VIEW_OPTION_CHECK_OPTION_LOCAL,
|
|
|
|
VIEW_OPTION_CHECK_OPTION_CASCADED
|
|
|
|
} ViewOptCheckOption;
|
2014-07-14 23:24:40 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ViewOptions
|
|
|
|
* Contents of rd_options for views
|
|
|
|
*/
|
|
|
|
typedef struct ViewOptions
|
|
|
|
{
|
|
|
|
int32 vl_len_; /* varlena header (do not touch directly!) */
|
|
|
|
bool security_barrier;
|
2019-09-25 20:56:52 +02:00
|
|
|
ViewOptCheckOption check_option;
|
2014-07-14 23:24:40 +02:00
|
|
|
} ViewOptions;
|
|
|
|
|
2011-12-22 22:15:57 +01:00
|
|
|
/*
|
|
|
|
* RelationIsSecurityView
|
2014-07-14 23:24:40 +02:00
|
|
|
* Returns whether the relation is a security view. Note multiple
|
|
|
|
* eval of argument!
|
2011-12-22 22:15:57 +01:00
|
|
|
*/
|
2019-11-01 13:16:21 +01:00
|
|
|
#define RelationIsSecurityView(relation) \
|
|
|
|
(AssertMacro(relation->rd_rel->relkind == RELKIND_VIEW), \
|
|
|
|
(relation)->rd_options ? \
|
|
|
|
((ViewOptions *) (relation)->rd_options)->security_barrier : false)
|
2011-12-22 22:15:57 +01:00
|
|
|
|
2013-07-18 23:10:16 +02:00
|
|
|
/*
|
|
|
|
* RelationHasCheckOption
|
|
|
|
* Returns true if the relation is a view defined with either the local
|
2014-07-14 23:24:40 +02:00
|
|
|
* or the cascaded check option. Note multiple eval of argument!
|
2013-07-18 23:10:16 +02:00
|
|
|
*/
|
|
|
|
#define RelationHasCheckOption(relation) \
|
2019-11-01 13:16:21 +01:00
|
|
|
(AssertMacro(relation->rd_rel->relkind == RELKIND_VIEW), \
|
|
|
|
(relation)->rd_options && \
|
2019-09-25 20:56:52 +02:00
|
|
|
((ViewOptions *) (relation)->rd_options)->check_option != \
|
|
|
|
VIEW_OPTION_CHECK_OPTION_NOT_SET)
|
2013-07-18 23:10:16 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationHasLocalCheckOption
|
|
|
|
* Returns true if the relation is a view defined with the local check
|
2014-07-14 23:24:40 +02:00
|
|
|
* option. Note multiple eval of argument!
|
2013-07-18 23:10:16 +02:00
|
|
|
*/
|
|
|
|
#define RelationHasLocalCheckOption(relation) \
|
2019-11-01 13:16:21 +01:00
|
|
|
(AssertMacro(relation->rd_rel->relkind == RELKIND_VIEW), \
|
|
|
|
(relation)->rd_options && \
|
2019-09-25 20:56:52 +02:00
|
|
|
((ViewOptions *) (relation)->rd_options)->check_option == \
|
|
|
|
VIEW_OPTION_CHECK_OPTION_LOCAL)
|
2013-07-18 23:10:16 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationHasCascadedCheckOption
|
|
|
|
* Returns true if the relation is a view defined with the cascaded check
|
2014-07-14 23:24:40 +02:00
|
|
|
* option. Note multiple eval of argument!
|
2013-07-18 23:10:16 +02:00
|
|
|
*/
|
|
|
|
#define RelationHasCascadedCheckOption(relation) \
|
2019-11-01 13:16:21 +01:00
|
|
|
(AssertMacro(relation->rd_rel->relkind == RELKIND_VIEW), \
|
|
|
|
(relation)->rd_options && \
|
2019-09-25 20:56:52 +02:00
|
|
|
((ViewOptions *) (relation)->rd_options)->check_option == \
|
|
|
|
VIEW_OPTION_CHECK_OPTION_CASCADED)
|
2013-12-11 01:17:34 +01:00
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/*
|
1999-02-14 00:22:53 +01:00
|
|
|
* RelationIsValid
|
1996-08-28 03:59:28 +02:00
|
|
|
* True iff relation descriptor is valid.
|
|
|
|
*/
|
|
|
|
#define RelationIsValid(relation) PointerIsValid(relation)
|
|
|
|
|
1999-09-18 21:08:25 +02:00
|
|
|
#define InvalidRelation ((Relation) NULL)
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/*
|
1999-02-14 00:22:53 +01:00
|
|
|
* RelationHasReferenceCountZero
|
1996-08-28 03:59:28 +02:00
|
|
|
* True iff relation reference count is zero.
|
|
|
|
*
|
|
|
|
* Note:
|
|
|
|
* Assumes relation descriptor is valid.
|
|
|
|
*/
|
|
|
|
#define RelationHasReferenceCountZero(relation) \
|
|
|
|
((bool)((relation)->rd_refcnt == 0))
|
|
|
|
|
|
|
|
/*
|
1999-02-14 00:22:53 +01:00
|
|
|
* RelationGetForm
|
2000-01-31 05:35:57 +01:00
|
|
|
* Returns pg_class tuple for a relation.
|
1996-08-28 03:59:28 +02:00
|
|
|
*
|
|
|
|
* Note:
|
|
|
|
* Assumes relation descriptor is valid.
|
|
|
|
*/
|
|
|
|
#define RelationGetForm(relation) ((relation)->rd_rel)
|
|
|
|
|
|
|
|
/*
|
1999-02-14 00:22:53 +01:00
|
|
|
* RelationGetRelid
|
2004-01-06 19:07:32 +01:00
|
|
|
* Returns the OID of the relation
|
1996-08-28 03:59:28 +02:00
|
|
|
*/
|
1998-08-19 04:04:17 +02:00
|
|
|
#define RelationGetRelid(relation) ((relation)->rd_id)
|
1996-08-28 03:59:28 +02:00
|
|
|
|
|
|
|
/*
|
1999-09-18 21:08:25 +02:00
|
|
|
* RelationGetNumberOfAttributes
|
2018-04-07 22:00:39 +02:00
|
|
|
* Returns the total number of attributes in a relation.
|
1996-08-28 03:59:28 +02:00
|
|
|
*/
|
|
|
|
#define RelationGetNumberOfAttributes(relation) ((relation)->rd_rel->relnatts)
|
|
|
|
|
2018-04-07 22:00:39 +02:00
|
|
|
/*
|
|
|
|
* IndexRelationGetNumberOfAttributes
|
|
|
|
* Returns the number of attributes in an index.
|
|
|
|
*/
|
|
|
|
#define IndexRelationGetNumberOfAttributes(relation) \
|
|
|
|
((relation)->rd_index->indnatts)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* IndexRelationGetNumberOfKeyAttributes
|
|
|
|
* Returns the number of key attributes in an index.
|
|
|
|
*/
|
|
|
|
#define IndexRelationGetNumberOfKeyAttributes(relation) \
|
|
|
|
((relation)->rd_index->indnkeyatts)
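/*
 * For example, an index created with CREATE INDEX ... (a, b) INCLUDE (c)
 * has indnatts = 3 but indnkeyatts = 2: the INCLUDE column is stored in the
 * index but plays no part in key comparisons or uniqueness enforcement.
 */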
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
/*
|
1999-02-14 00:22:53 +01:00
|
|
|
* RelationGetDescr
|
1996-08-28 03:59:28 +02:00
|
|
|
* Returns tuple descriptor for a relation.
|
|
|
|
*/
|
|
|
|
#define RelationGetDescr(relation) ((relation)->rd_att)
|
|
|
|
|
2001-06-19 14:03:41 +02:00
|
|
|
/*
|
|
|
|
* RelationGetRelationName
|
2004-01-06 19:07:32 +01:00
|
|
|
* Returns the rel's name.
|
2001-06-19 14:03:41 +02:00
|
|
|
*
|
2002-03-31 08:26:32 +02:00
|
|
|
* Note that the name is only unique within the containing namespace.
|
2001-06-19 14:03:41 +02:00
|
|
|
*/
|
|
|
|
#define RelationGetRelationName(relation) \
|
2002-03-31 08:26:32 +02:00
|
|
|
(NameStr((relation)->rd_rel->relname))
|
2001-06-19 14:03:41 +02:00
|
|
|
|
2002-03-26 20:17:02 +01:00
|
|
|
/*
|
|
|
|
* RelationGetNamespace
|
2004-01-06 19:07:32 +01:00
|
|
|
* Returns the rel's namespace OID.
|
2002-03-26 20:17:02 +01:00
|
|
|
*/
|
|
|
|
#define RelationGetNamespace(relation) \
|
|
|
|
((relation)->rd_rel->relnamespace)
|
|
|
|
|
2010-02-07 21:48:13 +01:00
|
|
|
/*
|
|
|
|
* RelationIsMapped
|
2019-01-04 18:51:17 +01:00
|
|
|
* True if the relation uses the relfilenode map. Note multiple eval
|
|
|
|
* of argument!
|
2010-02-07 21:48:13 +01:00
|
|
|
*/
|
|
|
|
#define RelationIsMapped(relation) \
|
2019-01-04 18:51:17 +01:00
|
|
|
(RELKIND_HAS_STORAGE((relation)->rd_rel->relkind) && \
|
|
|
|
((relation)->rd_rel->relfilenode == InvalidOid))
|
2010-02-07 21:48:13 +01:00
|
|
|
|
2005-01-10 21:02:24 +01:00
|
|
|
/*
|
2021-07-12 23:01:29 +02:00
|
|
|
* RelationGetSmgr
|
|
|
|
* Returns smgr file handle for a relation, opening it if needed.
|
|
|
|
*
|
|
|
|
* Very little code is authorized to touch rel->rd_smgr directly. Instead
|
|
|
|
* use this function to fetch its value.
|
|
|
|
*
|
|
|
|
* Note: since a relcache flush can cause the file handle to be closed again,
|
|
|
|
* it's unwise to hold onto the pointer returned by this function for any
|
|
|
|
* long period. Recommended practice is to just re-execute RelationGetSmgr
|
|
|
|
* each time you need to access the SMgrRelation. It's quite cheap in
|
|
|
|
* comparison to whatever an smgr function is going to do.
|
|
|
|
*/
|
|
|
|
static inline SMgrRelation
|
|
|
|
RelationGetSmgr(Relation rel)
|
|
|
|
{
|
|
|
|
if (unlikely(rel->rd_smgr == NULL))
|
|
|
|
smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_node, rel->rd_backend));
|
|
|
|
return rel->rd_smgr;
|
|
|
|
}
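/*
 * Illustrative call pattern: fetch the handle anew at each use rather than
 * caching it across operations that might flush the relcache, e.g.
 *		nblocks = smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);
 */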
|
2005-01-10 21:02:24 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationCloseSmgr
|
|
|
|
* Close the relation at the smgr level, if not already done.
|
|
|
|
*
|
|
|
|
* Note: smgrclose should unhook from owner pointer, hence the Assert.
|
|
|
|
*/
|
|
|
|
#define RelationCloseSmgr(relation) \
|
|
|
|
do { \
|
|
|
|
if ((relation)->rd_smgr != NULL) \
|
|
|
|
{ \
|
|
|
|
smgrclose((relation)->rd_smgr); \
|
|
|
|
Assert((relation)->rd_smgr == NULL); \
|
|
|
|
} \
|
|
|
|
} while (0)
|
|
|
|
|
2010-02-09 22:43:30 +01:00
|
|
|
/*
|
|
|
|
* RelationGetTargetBlock
|
|
|
|
* Fetch relation's current insertion target block.
|
|
|
|
*
|
|
|
|
* Returns InvalidBlockNumber if there is no current target block. Note
|
2021-07-12 23:01:29 +02:00
|
|
|
* that the target block status is discarded on any smgr-level invalidation,
|
|
|
|
* so there's no need to re-open the smgr handle if it's not currently open.
|
2010-02-09 22:43:30 +01:00
|
|
|
*/
|
|
|
|
#define RelationGetTargetBlock(relation) \
|
|
|
|
( (relation)->rd_smgr != NULL ? (relation)->rd_smgr->smgr_targblock : InvalidBlockNumber )
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationSetTargetBlock
|
|
|
|
* Set relation's current insertion target block.
|
|
|
|
*/
|
|
|
|
#define RelationSetTargetBlock(relation, targblock) \
|
|
|
|
do { \
|
2021-07-12 23:01:29 +02:00
|
|
|
RelationGetSmgr(relation)->smgr_targblock = (targblock); \
|
2010-02-09 22:43:30 +01:00
|
|
|
} while (0)
|
|
|
|
|
2021-03-23 01:22:48 +01:00
|
|
|
/*
|
|
|
|
* RelationIsPermanent
|
|
|
|
* True if relation is permanent.
|
|
|
|
*/
|
|
|
|
#define RelationIsPermanent(relation) \
|
|
|
|
((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
|
|
|
|
|
2010-12-13 18:34:26 +01:00
|
|
|
/*
|
|
|
|
* RelationNeedsWAL
|
|
|
|
* True if relation needs WAL.
|
Skip WAL for new relfilenodes, under wal_level=minimal.
Until now, only selected bulk operations (e.g. COPY) did this. If a
given relfilenode received both a WAL-skipping COPY and a WAL-logged
operation (e.g. INSERT), recovery could lose tuples from the COPY. See
src/backend/access/transam/README section "Skipping WAL for New
RelFileNode" for the new coding rules. Maintainers of table access
methods should examine that section.
To maintain data durability, just before commit, we choose between an
fsync of the relfilenode and copying its contents to WAL. A new GUC,
wal_skip_threshold, guides that choice. If this change slows a workload
that creates small, permanent relfilenodes under wal_level=minimal, try
adjusting wal_skip_threshold. Users setting a timeout on COMMIT may
need to adjust that timeout, and log_min_duration_statement analysis
will reflect time consumption moving to COMMIT from commands like COPY.
Internally, this requires a reliable determination of whether
RollbackAndReleaseCurrentSubTransaction() would unlink a relation's
current relfilenode. Introduce rd_firstRelfilenodeSubid. Amend the
specification of rd_createSubid such that the field is zero when a new
rel has an old rd_node. Make relcache.c retain entries for certain
dropped relations until end of transaction.
Bump XLOG_PAGE_MAGIC, since this introduces XLOG_GIST_ASSIGN_LSN.
Future servers accept older WAL, so this bump is discretionary.
Kyotaro Horiguchi, reviewed (in earlier, similar versions) by Robert
Haas. Heikki Linnakangas and Michael Paquier implemented earlier
designs that materially clarified the problem. Reviewed, in earlier
designs, by Andrew Dunstan, Andres Freund, Alvaro Herrera, Tom Lane,
Fujii Masao, and Simon Riggs. Reported by Martijn van Oosterhout.
Discussion: https://postgr.es/m/20150702220524.GA9392@svana.org
2020-04-04 21:25:34 +02:00
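In outline, the commit-time choice reads as below (a sketch under assumed
names; wal_skip_threshold is expressed in kilobytes):

typedef enum
{
	SYNC_BY_WAL,				/* copy the relfilenode's contents into WAL */
	SYNC_BY_FSYNC				/* fsync the relfilenode directly */
} PendingSyncMethod;

static PendingSyncMethod
choose_pending_sync(long total_blocks, int block_size_bytes,
					long wal_skip_threshold_kb)
{
	long		size_kb = total_blocks * (block_size_bytes / 1024);

	/* Small relations are cheaper to WAL-log; large ones cheaper to fsync. */
	return (size_kb >= wal_skip_threshold_kb) ? SYNC_BY_FSYNC : SYNC_BY_WAL;
}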
|
|
|
*
|
|
|
|
* Returns false if wal_level = minimal and this relation is created or
|
|
|
|
* truncated in the current transaction. See "Skipping WAL for New
|
|
|
|
* RelFileNode" in src/backend/access/transam/README.
|
|
|
|
*/
|
|
|
|
#define RelationNeedsWAL(relation) \
|
2021-03-23 01:22:48 +01:00
|
|
|
(RelationIsPermanent(relation) && (XLogIsNeeded() || \
|
2020-04-04 21:25:34 +02:00
|
|
|
(relation->rd_createSubid == InvalidSubTransactionId && \
|
|
|
|
relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))
|
2010-12-13 18:34:26 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationUsesLocalBuffers
|
|
|
|
* True if relation's pages are stored in local buffers.
|
|
|
|
*/
|
|
|
|
#define RelationUsesLocalBuffers(relation) \
|
|
|
|
((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
|
|
|
|
|
2004-08-28 22:31:44 +02:00
|
|
|
/*
|
|
|
|
* RELATION_IS_LOCAL
|
|
|
|
* If a rel is either temp or newly created in the current transaction,
|
2012-12-18 02:15:32 +01:00
|
|
|
* it can be assumed to be accessible only to the current backend.
|
|
|
|
* This is typically used to decide that we can skip acquiring locks.
|
2004-08-28 22:31:44 +02:00
|
|
|
*
|
|
|
|
* Beware of multiple eval of argument
|
|
|
|
*/
|
|
|
|
#define RELATION_IS_LOCAL(relation) \
|
2012-12-18 02:15:32 +01:00
|
|
|
((relation)->rd_islocaltemp || \
|
2004-09-16 18:58:44 +02:00
|
|
|
(relation)->rd_createSubid != InvalidSubTransactionId)
|
2004-08-28 22:31:44 +02:00
|
|
|
|
2009-04-01 00:12:48 +02:00
|
|
|
/*
|
|
|
|
* RELATION_IS_OTHER_TEMP
|
|
|
|
* Test for a temporary relation that belongs to some other session.
|
|
|
|
*
|
|
|
|
* Beware of multiple eval of argument
|
|
|
|
*/
|
|
|
|
#define RELATION_IS_OTHER_TEMP(relation) \
|
2012-12-18 02:15:32 +01:00
|
|
|
((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \
|
|
|
|
!(relation)->rd_islocaltemp)
|
2009-04-01 00:12:48 +02:00
|
|
|
|
2013-04-09 20:02:49 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationIsScannable
|
|
|
|
* Currently can only be false for a materialized view which has not been
|
|
|
|
* populated by its query. This is likely to get more complicated later,
|
|
|
|
* so use a macro which looks like a function.
|
|
|
|
*/
|
2013-05-06 19:26:51 +02:00
|
|
|
#define RelationIsScannable(relation) ((relation)->rd_rel->relispopulated)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationIsPopulated
|
|
|
|
* Currently, we don't physically distinguish the "populated" and
|
|
|
|
* "scannable" properties of matviews, but that may change later.
|
|
|
|
* Hence, use the appropriate one of these macros in code tests.
|
|
|
|
*/
|
|
|
|
#define RelationIsPopulated(relation) ((relation)->rd_rel->relispopulated)
|
2013-04-09 20:02:49 +02:00
|
|
|
|
Add new wal_level, logical, sufficient for logical decoding.
When wal_level=logical, we'll log columns from the old tuple as
configured by the REPLICA IDENTITY facility added in commit
07cacba983ef79be4a84fcd0e0ca3b5fcb85dd65. This makes it possible for
a properly-configured logical replication solution to correctly
follow table updates even if they change the chosen key columns,
or, with REPLICA IDENTITY FULL, even if the table has no key at
all. Note that updates which do not modify the replica identity
column won't log anything extra, making the choice of a good key
(i.e. one that will rarely be changed) important to performance
when wal_level=logical is configured.
Each insert, update, or delete to a catalog table will also log
the CMIN and/or CMAX values stamped by the current transaction.
This is necessary because logical decoding will require access to
historical snapshots of the catalog in order to decode some data
types, and the CMIN/CMAX values that we may need in order to judge
row visibility may have been overwritten by the time we need them.
Andres Freund, reviewed in various versions by myself, Heikki
Linnakangas, KONDO Mitsumasa, and many others.
2013-12-11 00:33:45 +01:00
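In rough terms, whether an UPDATE must log extra old-tuple data can be
pictured as follows (a sketch with hypothetical names, not the actual heapam
logic; the character codes match pg_class.relreplident):

static bool
update_logs_old_tuple(char relreplident, bool identity_column_changed)
{
	if (relreplident == 'n')	/* REPLICA IDENTITY NOTHING */
		return false;
	if (relreplident == 'f')	/* REPLICA IDENTITY FULL: always log old row */
		return true;
	/* DEFAULT ('d') or USING INDEX ('i'): log only if identity key changed */
	return identity_column_changed;
}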
|
|
|
/*
|
|
|
|
* RelationIsAccessibleInLogicalDecoding
|
|
|
|
* True if we need to log enough information to have access via
|
|
|
|
* a logical decoding snapshot.
|
|
|
|
*/
|
|
|
|
#define RelationIsAccessibleInLogicalDecoding(relation) \
|
|
|
|
(XLogLogicalInfoActive() && \
|
|
|
|
RelationNeedsWAL(relation) && \
|
2013-12-11 01:17:34 +01:00
|
|
|
(IsCatalogRelation(relation) || RelationIsUsedAsCatalogTable(relation)))
|
2013-12-11 00:33:45 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationIsLogicallyLogged
|
|
|
|
* True if we need to log enough information to extract the data from the
|
|
|
|
* WAL stream.
|
|
|
|
*
|
|
|
|
* We don't log information for unlogged tables (since they don't WAL log
|
Allow TRUNCATE command to truncate foreign tables.
This commit introduces a new foreign data wrapper API for TRUNCATE.
It extends the TRUNCATE command so that it accepts foreign tables as
targets to truncate and invokes that API. It also extends postgres_fdw
so that it can issue TRUNCATE commands to foreign servers, by adding a
new routine for that TRUNCATE API.
The information about options specified in the TRUNCATE command, e.g.,
ONLY, CASCADE, etc., is passed to the FDW via the API, along with the
list of foreign tables to truncate. The FDW truncates the foreign data
sources that the passed foreign tables specify, based on that
information. For example, postgres_fdw constructs a TRUNCATE command
using them and issues it to the foreign server.
For performance, the TRUNCATE command invokes the FDW routine once per
foreign server to which the foreign tables being truncated belong.
Author: Kazutaka Onishi, Kohei KaiGai, slightly modified by Fujii Masao
Reviewed-by: Bharath Rupireddy, Michael Paquier, Zhihong Yu, Alvaro Herrera, Stephen Frost, Ashutosh Bapat, Amit Langote, Daniel Gustafsson, Ibrar Ahmed, Fujii Masao
Discussion: https://postgr.es/m/CAOP8fzb_gkReLput7OvOK+8NHgw-RKqNv59vem7=524krQTcWA@mail.gmail.com
Discussion: https://postgr.es/m/CAJuF6cMWDDqU-vn_knZgma+2GMaout68YUgn1uyDnexRhqqM5Q@mail.gmail.com
2021-04-08 13:56:08 +02:00
|
|
|
* anyway), for foreign tables (since they don't WAL log, either),
|
|
|
|
* and for system tables (their content is hard to make sense of, and
|
2013-12-11 01:17:34 +01:00
|
|
|
* it would complicate decoding slightly for little gain). Note that we *do*
|
|
|
|
* log information for user defined catalog tables since they presumably are
|
|
|
|
* interesting to the user...
|
2013-12-11 00:33:45 +01:00
|
|
|
*/
|
|
|
|
#define RelationIsLogicallyLogged(relation) \
|
|
|
|
(XLogLogicalInfoActive() && \
|
|
|
|
RelationNeedsWAL(relation) && \
|
2021-04-08 13:56:08 +02:00
|
|
|
(relation)->rd_rel->relkind != RELKIND_FOREIGN_TABLE && \
|
2013-12-11 00:33:45 +01:00
|
|
|
!IsCatalogRelation(relation))
|
2013-04-09 20:02:49 +02:00
|
|
|
|
2004-07-17 05:32:14 +02:00
|
|
|
/* routines in utils/cache/relcache.c */
|
|
|
|
extern void RelationIncrementReferenceCount(Relation rel);
|
|
|
|
extern void RelationDecrementReferenceCount(Relation rel);
|
|
|
|
|
1996-08-28 03:59:28 +02:00
|
|
|
#endif /* REL_H */
|