postgresql/src/backend/catalog/catalog.c

455 lines
14 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* catalog.c
* routines concerned with catalog naming conventions and other
* bits of hard-wired knowledge
*
*
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
2010-09-20 22:08:53 +02:00
* src/backend/catalog/catalog.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <fcntl.h>
#include <unistd.h>
#include "access/genam.h"
#include "access/sysattr.h"
1999-07-16 07:00:38 +02:00
#include "access/transam.h"
#include "catalog/catalog.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_auth_members.h"
#include "catalog/pg_authid.h"
#include "catalog/pg_database.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_pltemplate.h"
#include "catalog/pg_db_role_setting.h"
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
#include "catalog/pg_replication_origin.h"
#include "catalog/pg_shdepend.h"
#include "catalog/pg_shdescription.h"
#include "catalog/pg_shseclabel.h"
#include "catalog/pg_tablespace.h"
#include "catalog/toasting.h"
1999-07-16 07:00:38 +02:00
#include "miscadmin.h"
#include "storage/fd.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/tqual.h"
/*
* IsSystemRelation
* True iff the relation is either a system catalog or toast table.
* By a system catalog, we mean one that created in the pg_catalog schema
* during initdb. User-created relations in pg_catalog don't count as
* system catalogs.
*
* NB: TOAST relations are considered system relations by this test
* for compatibility with the old IsSystemRelationName function.
* This is appropriate in many places but not all. Where it's not,
* also check IsToastRelation or use IsCatalogRelation().
*/
bool
IsSystemRelation(Relation relation)
{
return IsSystemClass(RelationGetRelid(relation), relation->rd_rel);
}
/*
* IsSystemClass
* Like the above, but takes a Form_pg_class as argument.
* Used when we do not want to open the relation and have to
* search pg_class directly.
*/
bool
IsSystemClass(Oid relid, Form_pg_class reltuple)
{
return IsToastClass(reltuple) || IsCatalogClass(relid, reltuple);
}
/*
* IsCatalogRelation
* True iff the relation is a system catalog, or the toast table for
* a system catalog. By a system catalog, we mean one that created
* in the pg_catalog schema during initdb. As with IsSystemRelation(),
* user-created relations in pg_catalog don't count as system catalogs.
*
* Note that IsSystemRelation() returns true for ALL toast relations,
* but this function returns true only for toast relations of system
* catalogs.
*/
bool
IsCatalogRelation(Relation relation)
{
return IsCatalogClass(RelationGetRelid(relation), relation->rd_rel);
}
/*
* IsCatalogClass
* True iff the relation is a system catalog relation.
*
* Check IsCatalogRelation() for details.
*/
bool
IsCatalogClass(Oid relid, Form_pg_class reltuple)
{
Oid relnamespace = reltuple->relnamespace;
/*
* Never consider relations outside pg_catalog/pg_toast to be catalog
* relations.
*/
if (!IsSystemNamespace(relnamespace) && !IsToastNamespace(relnamespace))
return false;
/* ----
* Check whether the oid was assigned during initdb, when creating the
* initial template database. Minus the relations in information_schema
* excluded above, these are integral part of the system.
* We could instead check whether the relation is pinned in pg_depend, but
* this is noticeably cheaper and doesn't require catalog access.
*
* This test is safe since even an oid wraparound will preserve this
* property (c.f. GetNewObjectId()) and it has the advantage that it works
* correctly even if a user decides to create a relation in the pg_catalog
* namespace.
* ----
*/
return relid < FirstNormalObjectId;
}
/*
* IsToastRelation
* True iff relation is a TOAST support relation (or index).
*/
bool
IsToastRelation(Relation relation)
{
return IsToastNamespace(RelationGetNamespace(relation));
}
/*
* IsToastClass
* Like the above, but takes a Form_pg_class as argument.
* Used when we do not want to open the relation and have to
* search pg_class directly.
*/
bool
IsToastClass(Form_pg_class reltuple)
{
2002-09-04 22:31:48 +02:00
Oid relnamespace = reltuple->relnamespace;
2002-09-04 22:31:48 +02:00
return IsToastNamespace(relnamespace);
}
/*
* IsSystemNamespace
* True iff namespace is pg_catalog.
*
* NOTE: the reason this isn't a macro is to avoid having to include
* catalog/pg_namespace.h in a lot of places.
*/
bool
IsSystemNamespace(Oid namespaceId)
{
return namespaceId == PG_CATALOG_NAMESPACE;
}
/*
* IsToastNamespace
* True iff namespace is pg_toast or my temporary-toast-table namespace.
*
* Note: this will return false for temporary-toast-table namespaces belonging
* to other backends. Those are treated the same as other backends' regular
* temp table namespaces, and access is prevented where appropriate.
*/
bool
IsToastNamespace(Oid namespaceId)
{
return (namespaceId == PG_TOAST_NAMESPACE) ||
isTempToastNamespace(namespaceId);
}
/*
* IsReservedName
* True iff name starts with the pg_ prefix.
*
* For some classes of objects, the prefix pg_ is reserved for
* system objects only. As of 8.0, this is only true for
* schema and tablespace names.
*/
bool
IsReservedName(const char *name)
{
/* ugly coding for speed */
return (name[0] == 'p' &&
name[1] == 'g' &&
name[2] == '_');
}
/*
* IsSharedRelation
* Given the OID of a relation, determine whether it's supposed to be
* shared across an entire database cluster.
*
* In older releases, this had to be hard-wired so that we could compute the
* locktag for a relation and lock it before examining its catalog entry.
* Since we now have MVCC catalog access, the race conditions that made that
* a hard requirement are gone, so we could look at relaxing this restriction.
* However, if we scanned the pg_class entry to find relisshared, and only
* then locked the relation, pg_class could get updated in the meantime,
* forcing us to scan the relation again, which would definitely be complex
* and might have undesirable performance consequences. Fortunately, the set
* of shared relations is fairly static, so a hand-maintained list of their
* OIDs isn't completely impractical.
*/
bool
IsSharedRelation(Oid relationId)
{
/* These are the shared catalogs (look for BKI_SHARED_RELATION) */
if (relationId == AuthIdRelationId ||
relationId == AuthMemRelationId ||
relationId == DatabaseRelationId ||
relationId == PLTemplateRelationId ||
relationId == SharedDescriptionRelationId ||
relationId == SharedDependRelationId ||
relationId == SharedSecLabelRelationId ||
relationId == TableSpaceRelationId ||
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
relationId == DbRoleSettingRelationId ||
relationId == ReplicationOriginRelationId)
return true;
/* These are their indexes (see indexing.h) */
if (relationId == AuthIdRolnameIndexId ||
relationId == AuthIdOidIndexId ||
relationId == AuthMemRoleMemIndexId ||
relationId == AuthMemMemRoleIndexId ||
relationId == DatabaseNameIndexId ||
relationId == DatabaseOidIndexId ||
relationId == PLTemplateNameIndexId ||
relationId == SharedDescriptionObjIndexId ||
relationId == SharedDependDependerIndexId ||
relationId == SharedDependReferenceIndexId ||
relationId == SharedSecLabelObjectIndexId ||
relationId == TablespaceOidIndexId ||
relationId == TablespaceNameIndexId ||
Introduce replication progress tracking infrastructure. When implementing a replication solution ontop of logical decoding, two related problems exist: * How to safely keep track of replication progress * How to change replication behavior, based on the origin of a row; e.g. to avoid loops in bi-directional replication setups The solution to these problems, as implemented here, consist out of three parts: 1) 'replication origins', which identify nodes in a replication setup. 2) 'replication progress tracking', which remembers, for each replication origin, how far replay has progressed in a efficient and crash safe manner. 3) The ability to filter out changes performed on the behest of a replication origin during logical decoding; this allows complex replication topologies. E.g. by filtering all replayed changes out. Most of this could also be implemented in "userspace", e.g. by inserting additional rows contain origin information, but that ends up being much less efficient and more complicated. We don't want to require various replication solutions to reimplement logic for this independently. The infrastructure is intended to be generic enough to be reusable. This infrastructure also replaces the 'nodeid' infrastructure of commit timestamps. It is intended to provide all the former capabilities, except that there's only 2^16 different origins; but now they integrate with logical decoding. Additionally more functionality is accessible via SQL. Since the commit timestamp infrastructure has also been introduced in 9.5 (commit 73c986add) changing the API is not a problem. For now the number of origins for which the replication progress can be tracked simultaneously is determined by the max_replication_slots GUC. That GUC is not a perfect match to configure this, but there doesn't seem to be sufficient reason to introduce a separate new one. Bumps both catversion and wal page magic. Author: Andres Freund, with contributions from Petr Jelinek and Craig Ringer Reviewed-By: Heikki Linnakangas, Petr Jelinek, Robert Haas, Steve Singer Discussion: 20150216002155.GI15326@awork2.anarazel.de, 20140923182422.GA15776@alap3.anarazel.de, 20131114172632.GE7522@alap2.anarazel.de
2015-04-29 19:30:53 +02:00
relationId == DbRoleSettingDatidRolidIndexId ||
relationId == ReplicationOriginIdentIndex ||
relationId == ReplicationOriginNameIndex)
return true;
/* These are their toast tables and toast indexes (see toasting.h) */
if (relationId == PgShdescriptionToastTable ||
relationId == PgShdescriptionToastIndex ||
relationId == PgDbRoleSettingToastTable ||
relationId == PgDbRoleSettingToastIndex ||
relationId == PgShseclabelToastTable ||
relationId == PgShseclabelToastIndex)
return true;
return false;
}
/*
* GetNewOid
* Generate a new OID that is unique within the given relation.
*
* Caller must have a suitable lock on the relation.
*
* Uniqueness is promised only if the relation has a unique index on OID.
* This is true for all system catalogs that have OIDs, but might not be
* true for user tables. Note that we are effectively assuming that the
* table has a relatively small number of entries (much less than 2^32)
* and there aren't very long runs of consecutive existing OIDs. Again,
* this is reasonable for system catalogs but less so for user tables.
*
* Since the OID is not immediately inserted into the table, there is a
* race condition here; but a problem could occur only if someone else
* managed to cycle through 2^32 OIDs and generate the same OID before we
* finish inserting our row. This seems unlikely to be a problem. Note
* that if we had to *commit* the row to end the race condition, the risk
* would be rather higher; therefore we use SnapshotDirty in the test,
* so that we will see uncommitted rows.
*/
Oid
GetNewOid(Relation relation)
{
Oid oidIndex;
/* If relation doesn't have OIDs at all, caller is confused */
Assert(relation->rd_rel->relhasoids);
/* In bootstrap mode, we don't have any indexes to use */
if (IsBootstrapProcessingMode())
return GetNewObjectId();
/* The relcache will cache the identity of the OID index for us */
oidIndex = RelationGetOidIndex(relation);
/* If no OID index, just hand back the next OID counter value */
if (!OidIsValid(oidIndex))
{
/*
2005-10-15 04:49:52 +02:00
* System catalogs that have OIDs should *always* have a unique OID
* index; we should only take this path for user tables. Give a
* warning if it looks like somebody forgot an index.
*/
if (IsSystemRelation(relation))
elog(WARNING, "generating possibly-non-unique OID for \"%s\"",
RelationGetRelationName(relation));
return GetNewObjectId();
}
/* Otherwise, use the index to find a nonconflicting OID */
return GetNewOidWithIndex(relation, oidIndex, ObjectIdAttributeNumber);
}
/*
* GetNewOidWithIndex
* Guts of GetNewOid: use the supplied index
*
* This is exported separately because there are cases where we want to use
* an index that will not be recognized by RelationGetOidIndex: TOAST tables
* have indexes that are usable, but have multiple columns and are on
* ordinary columns rather than a true OID column. This code will work
* anyway, so long as the OID is the index's first column. The caller must
* pass in the actual heap attnum of the OID column, however.
*
* Caller must have a suitable lock on the relation.
*/
Oid
GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn)
{
Oid newOid;
SnapshotData SnapshotDirty;
SysScanDesc scan;
ScanKeyData key;
bool collides;
InitDirtySnapshot(SnapshotDirty);
/* Generate new OIDs until we find one not in the table */
do
{
CHECK_FOR_INTERRUPTS();
newOid = GetNewObjectId();
ScanKeyInit(&key,
oidcolumn,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(newOid));
/* see notes above about using SnapshotDirty */
scan = systable_beginscan(relation, indexId, true,
&SnapshotDirty, 1, &key);
collides = HeapTupleIsValid(systable_getnext(scan));
systable_endscan(scan);
} while (collides);
return newOid;
}
/*
* GetNewRelFileNode
* Generate a new relfilenode number that is unique within the
* database of the given tablespace.
*
* If the relfilenode will also be used as the relation's OID, pass the
* opened pg_class catalog, and this routine will guarantee that the result
* is also an unused OID within pg_class. If the result is to be used only
* as a relfilenode for an existing relation, pass NULL for pg_class.
*
* As with GetNewOid, there is some theoretical risk of a race condition,
* but it doesn't seem worth worrying about.
*
* Note: we don't support using this in bootstrap mode. All relations
* created by bootstrap have preassigned OIDs, so there's no need.
*/
Oid
GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence)
{
RelFileNodeBackend rnode;
char *rpath;
int fd;
bool collides;
BackendId backend;
switch (relpersistence)
{
case RELPERSISTENCE_TEMP:
backend = MyBackendId;
break;
case RELPERSISTENCE_UNLOGGED:
case RELPERSISTENCE_PERMANENT:
backend = InvalidBackendId;
break;
default:
elog(ERROR, "invalid relpersistence: %c", relpersistence);
return InvalidOid; /* placate compiler */
}
/* This logic should match RelationInitPhysicalAddr */
rnode.node.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
rnode.node.dbNode = (rnode.node.spcNode == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId;
/*
* The relpath will vary based on the backend ID, so we must initialize
* that properly here to make sure that any collisions based on filename
* are properly detected.
*/
rnode.backend = backend;
do
{
CHECK_FOR_INTERRUPTS();
/* Generate the OID */
if (pg_class)
rnode.node.relNode = GetNewOid(pg_class);
else
rnode.node.relNode = GetNewObjectId();
/* Check for existing file of same name */
rpath = relpath(rnode, MAIN_FORKNUM);
fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);
if (fd >= 0)
{
/* definite collision */
close(fd);
collides = true;
}
else
{
/*
* Here we have a little bit of a dilemma: if errno is something
2005-10-15 04:49:52 +02:00
* other than ENOENT, should we declare a collision and loop? In
* particular one might think this advisable for, say, EPERM.
* However there really shouldn't be any unreadable files in a
* tablespace directory, and if the EPERM is actually complaining
* that we can't read the directory itself, we'd be in an infinite
* loop. In practice it seems best to go ahead regardless of the
2005-10-15 04:49:52 +02:00
* errno. If there is a colliding file we will get an smgr
* failure when we attempt to create the new relation file.
*/
collides = false;
}
pfree(rpath);
} while (collides);
return rnode.node.relNode;
}