/*-------------------------------------------------------------------------
*
* catalog.c
* routines concerned with catalog naming conventions and other
* bits of hard-wired knowledge
*
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/catalog/catalog.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <fcntl.h>
#include <unistd.h>

#include "access/genam.h"
#include "access/sysattr.h"
#include "access/transam.h"
#include "catalog/catalog.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_auth_members.h"
#include "catalog/pg_authid.h"
#include "catalog/pg_database.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_pltemplate.h"
#include "catalog/pg_db_role_setting.h"
#include "catalog/pg_replication_origin.h"
#include "catalog/pg_shdepend.h"
#include "catalog/pg_shdescription.h"
#include "catalog/pg_shseclabel.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_tablespace.h"
#include "catalog/pg_type.h"
#include "catalog/toasting.h"
#include "miscadmin.h"
#include "storage/fd.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/tqual.h"
/*
* IsSystemRelation
* True iff the relation is either a system catalog or a TOAST table.
* By a system catalog, we mean one that was created in the pg_catalog
* schema during initdb. User-created relations in pg_catalog don't
* count as system catalogs.
*
* NB: TOAST relations are considered system relations by this test
* for compatibility with the old IsSystemRelationName function.
* This is appropriate in many places but not all. Where it's not,
* also check IsToastRelation() or use IsCatalogRelation().
*/
bool
IsSystemRelation(Relation relation)
{
return IsSystemClass(RelationGetRelid(relation), relation->rd_rel);
}

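/*
 * Illustrative sketch (not part of the original file): callers typically
 * use IsSystemRelation() to refuse ordinary DDL against catalogs unless
 * allow_system_table_mods is set.  The helper name below is hypothetical;
 * compare the similar checks in tablecmds.c.
 *
 *    static void
 *    check_not_system(Relation rel)
 *    {
 *        if (IsSystemRelation(rel) && !allowSystemTableMods)
 *            ereport(ERROR,
 *                    (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 *                     errmsg("permission denied: \"%s\" is a system catalog",
 *                            RelationGetRelationName(rel))));
 *    }
 */
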
/*
* IsSystemClass
* Like the above, but takes a Form_pg_class as argument.
* Used when we do not want to open the relation and have to
* search pg_class directly.
*/
bool
IsSystemClass(Oid relid, Form_pg_class reltuple)
{
return IsToastClass(reltuple) || IsCatalogClass(relid, reltuple);
}

/*
* IsCatalogRelation
* True iff the relation is a system catalog, or the toast table for
* a system catalog. By a system catalog, we mean one that was created
* in the pg_catalog schema during initdb. As with IsSystemRelation(),
* user-created relations in pg_catalog don't count as system catalogs.
*
* Note that IsSystemRelation() returns true for ALL toast relations,
* but this function returns true only for toast relations of system
* catalogs.
*/
bool
IsCatalogRelation(Relation relation)
{
return IsCatalogClass(RelationGetRelid(relation), relation->rd_rel);
}

/*
* IsCatalogClass
* True iff the relation is a system catalog relation.
*
* Check IsCatalogRelation() for details.
*/
bool
IsCatalogClass(Oid relid, Form_pg_class reltuple)
{
Oid relnamespace = reltuple->relnamespace;
/*
* Never consider relations outside pg_catalog/pg_toast to be catalog
* relations.
*/
if (!IsSystemNamespace(relnamespace) && !IsToastNamespace(relnamespace))
return false;
/* ----
* Check whether the OID was assigned during initdb, when creating the
* initial template database. Minus the relations in information_schema
* excluded above, these are an integral part of the system.
* We could instead check whether the relation is pinned in pg_depend, but
* this is noticeably cheaper and doesn't require catalog access.
*
* This test is safe since even an oid wraparound will preserve this
* property (cf. GetNewObjectId()) and it has the advantage that it works
* correctly even if a user decides to create a relation in the pg_catalog
* namespace.
* ----
*/
return relid < FirstNormalObjectId;
}

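/*
 * Worked example (illustrative): FirstNormalObjectId is 16384, and every
 * OID assigned during initdb is below it.  So pg_class (OID 1259, schema
 * pg_catalog) passes both tests above, while a user table created in
 * pg_catalog after initdb gets an OID >= 16384 and is correctly rejected
 * by the range test even though it passes the namespace test.
 */
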
/*
* IsToastRelation
* True iff the relation is a TOAST support relation (or index).
*/
bool
IsToastRelation(Relation relation)
{
return IsToastNamespace(RelationGetNamespace(relation));
}

/*
* IsToastClass
* Like the above, but takes a Form_pg_class as argument.
* Used when we do not want to open the relation and have to
* search pg_class directly.
*/
bool
IsToastClass(Form_pg_class reltuple)
{
Oid relnamespace = reltuple->relnamespace;
return IsToastNamespace(relnamespace);
}

/*
* IsSystemNamespace
* True iff namespace is pg_catalog.
*
* NOTE: the reason this isn't a macro is to avoid having to include
* catalog/pg_namespace.h in a lot of places.
*/
bool
IsSystemNamespace(Oid namespaceId)
{
return namespaceId == PG_CATALOG_NAMESPACE;
}

/*
* IsToastNamespace
* True iff namespace is pg_toast or my temporary-toast-table namespace.
*
* Note: this will return false for temporary-toast-table namespaces belonging
* to other backends. Those are treated the same as other backends' regular
* temp table namespaces, and access is prevented where appropriate.
*/
bool
IsToastNamespace(Oid namespaceId)
{
return (namespaceId == PG_TOAST_NAMESPACE) ||
isTempToastNamespace(namespaceId);
}

/*
* IsReservedName
* True iff name starts with the pg_ prefix.
*
* For some classes of objects, the prefix pg_ is reserved for
* system objects only. Since 8.0 this has applied to schema and
* tablespace names; since 9.6 it also applies to role names.
*/
bool
IsReservedName(const char *name)
{
/* ugly coding for speed */
return (name[0] == 'p' &&
name[1] == 'g' &&
name[2] == '_');
}

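/*
 * Illustrative sketch: role creation applies this kind of test; the exact
 * wording here is only indicative (compare CreateRole() in user.c).
 *
 *    if (IsReservedName(stmt->role))
 *        ereport(ERROR,
 *                (errcode(ERRCODE_RESERVED_NAME),
 *                 errmsg("role name \"%s\" is reserved", stmt->role),
 *                 errdetail("Role names starting with \"pg_\" are reserved.")));
 */
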
/*
* IsSharedRelation
* Given the OID of a relation, determine whether it's supposed to be
* shared across an entire database cluster.
*
* In older releases, this had to be hard-wired so that we could compute the
* locktag for a relation and lock it before examining its catalog entry.
* Since we now have MVCC catalog access, the race conditions that made that
* a hard requirement are gone, so we could look at relaxing this restriction.
* However, if we scanned the pg_class entry to find relisshared, and only
* then locked the relation, pg_class could get updated in the meantime,
* forcing us to scan the relation again, which would definitely be complex
* and might have undesirable performance consequences. Fortunately, the set
* of shared relations is fairly static, so a hand-maintained list of their
* OIDs isn't completely impractical.
*/
bool
IsSharedRelation(Oid relationId)
{
/* These are the shared catalogs (look for BKI_SHARED_RELATION) */
if (relationId == AuthIdRelationId ||
relationId == AuthMemRelationId ||
relationId == DatabaseRelationId ||
relationId == PLTemplateRelationId ||
relationId == SharedDescriptionRelationId ||
relationId == SharedDependRelationId ||
relationId == SharedSecLabelRelationId ||
relationId == TableSpaceRelationId ||
relationId == DbRoleSettingRelationId ||
relationId == ReplicationOriginRelationId ||
relationId == SubscriptionRelationId)
return true;
/* These are their indexes (see indexing.h) */
if (relationId == AuthIdRolnameIndexId ||
relationId == AuthIdOidIndexId ||
relationId == AuthMemRoleMemIndexId ||
relationId == AuthMemMemRoleIndexId ||
relationId == DatabaseNameIndexId ||
relationId == DatabaseOidIndexId ||
relationId == PLTemplateNameIndexId ||
relationId == SharedDescriptionObjIndexId ||
relationId == SharedDependDependerIndexId ||
relationId == SharedDependReferenceIndexId ||
relationId == SharedSecLabelObjectIndexId ||
relationId == TablespaceOidIndexId ||
relationId == TablespaceNameIndexId ||
relationId == DbRoleSettingDatidRolidIndexId ||
relationId == ReplicationOriginIdentIndex ||
relationId == ReplicationOriginNameIndex ||
relationId == SubscriptionObjectIndexId ||
relationId == SubscriptionNameIndexId)
return true;
/* These are their toast tables and toast indexes (see toasting.h) */
if (relationId == PgShdescriptionToastTable ||
relationId == PgShdescriptionToastIndex ||
relationId == PgDbRoleSettingToastTable ||
relationId == PgDbRoleSettingToastIndex ||
relationId == PgShseclabelToastTable ||
relationId == PgShseclabelToastIndex)
return true;
return false;
}

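/*
 * Illustrative sketch: because the list above is hard-wired, lock-manager
 * callers can build a locktag for a shared catalog without first reading
 * its pg_class entry.  Variable names below are hypothetical; shared
 * relations use InvalidOid as the locktag's database field.
 *
 *    Oid dbId = IsSharedRelation(relId) ? InvalidOid : MyDatabaseId;
 *
 *    SET_LOCKTAG_RELATION(tag, dbId, relId);
 */
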
/*
* GetNewOid
* Generate a new OID that is unique within the given relation.
*
* Caller must have a suitable lock on the relation.
*
* Uniqueness is promised only if the relation has a unique index on OID.
* This is true for all system catalogs that have OIDs, but might not be
* true for user tables. Note that we are effectively assuming that the
* table has a relatively small number of entries (much less than 2^32)
* and there aren't very long runs of consecutive existing OIDs. Again,
* this is reasonable for system catalogs but less so for user tables.
*
* Since the OID is not immediately inserted into the table, there is a
* race condition here; but a problem could occur only if someone else
* managed to cycle through 2^32 OIDs and generate the same OID before we
* finish inserting our row. This seems unlikely to be a problem. Note
* that if we had to *commit* the row to end the race condition, the risk
* would be rather higher; therefore we use SnapshotDirty in the test,
* so that we will see uncommitted rows.
*/
Oid
GetNewOid(Relation relation)
{
Oid oidIndex;
/* If relation doesn't have OIDs at all, caller is confused */
Assert(relation->rd_rel->relhasoids);
/* In bootstrap mode, we don't have any indexes to use */
if (IsBootstrapProcessingMode())
return GetNewObjectId();
/* The relcache will cache the identity of the OID index for us */
oidIndex = RelationGetOidIndex(relation);
/* If no OID index, just hand back the next OID counter value */
if (!OidIsValid(oidIndex))
{
/*
* System catalogs that have OIDs should *always* have a unique OID
* index; we should only take this path for user tables. Give a
* warning if it looks like somebody forgot an index.
*/
if (IsSystemRelation(relation))
elog(WARNING, "generating possibly-non-unique OID for \"%s\"",
RelationGetRelationName(relation));
return GetNewObjectId();
}
/* Otherwise, use the index to find a nonconflicting OID */
return GetNewOidWithIndex(relation, oidIndex, ObjectIdAttributeNumber);
}

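/*
 * Illustrative sketch (SomeCatalogRelationId is a placeholder, not a real
 * symbol): a caller that needs the OID before building its tuple can fetch
 * one explicitly; the usual path is for heap_insert() to call GetNewOid()
 * itself when the relation has OIDs.
 *
 *    Relation rel = heap_open(SomeCatalogRelationId, RowExclusiveLock);
 *    Oid      newoid = GetNewOid(rel);
 *
 *    ... build and insert a tuple that uses newoid ...
 *
 *    heap_close(rel, RowExclusiveLock);
 */
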
/*
* GetNewOidWithIndex
* Guts of GetNewOid: use the supplied index
*
* This is exported separately because there are cases where we want to use
* an index that will not be recognized by RelationGetOidIndex: TOAST tables
* have indexes that are usable, but have multiple columns and are on
* ordinary columns rather than a true OID column. This code will work
* anyway, so long as the OID is the index's first column. The caller must
* pass in the actual heap attnum of the OID column, however.
*
* Caller must have a suitable lock on the relation.
*/
Oid
GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn)
{
Oid newOid;
SnapshotData SnapshotDirty;
SysScanDesc scan;
ScanKeyData key;
bool collides;
/*
* We should never be asked to generate a new pg_type OID during
* pg_upgrade; doing so would risk collisions with the OIDs it wants to
* assign. Hitting this assert means there's some path where we failed to
* ensure that a type OID is determined by commands in the dump script.
*/
Assert(!IsBinaryUpgrade || RelationGetRelid(relation) != TypeRelationId);
InitDirtySnapshot(SnapshotDirty);
/* Generate new OIDs until we find one not in the table */
do
{
CHECK_FOR_INTERRUPTS();
newOid = GetNewObjectId();
ScanKeyInit(&key,
oidcolumn,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(newOid));
/* see notes above about using SnapshotDirty */
scan = systable_beginscan(relation, indexId, true,
&SnapshotDirty, 1, &key);
collides = HeapTupleIsValid(systable_getnext(scan));
systable_endscan(scan);
} while (collides);
return newOid;
}

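/*
 * Illustrative sketch: this is how TOAST code allocates chunk_ids, passing
 * the toast table's index explicitly since RelationGetOidIndex() would not
 * recognize it (compare toast_save_datum() in tuptoaster.c; toastrel and
 * toastidx stand in for that function's local variables):
 *
 *    toast_pointer.va_valueid =
 *        GetNewOidWithIndex(toastrel,
 *                           RelationGetRelid(toastidx),
 *                           (AttrNumber) 1);
 */
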
/*
* GetNewRelFileNode
* Generate a new relfilenode number that is unique within the
* database of the given tablespace.
*
* If the relfilenode will also be used as the relation's OID, pass the
* opened pg_class catalog, and this routine will guarantee that the result
* is also an unused OID within pg_class. If the result is to be used only
* as a relfilenode for an existing relation, pass NULL for pg_class.
*
* As with GetNewOid, there is some theoretical risk of a race condition,
* but it doesn't seem worth worrying about.
*
* Note: we don't support using this in bootstrap mode. All relations
* created by bootstrap have preassigned OIDs, so there's no need.
*/
Oid
GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence)
{
RelFileNodeBackend rnode;
char *rpath;
int fd;
bool collides;
BackendId backend;
/*
* If we ever get here during pg_upgrade, there's something wrong; all
* relfilenode assignments during a binary-upgrade run should be
* determined by commands in the dump script.
*/
Assert(!IsBinaryUpgrade);
switch (relpersistence)
{
case RELPERSISTENCE_TEMP:
backend = BackendIdForTempRelations();
break;
case RELPERSISTENCE_UNLOGGED:
case RELPERSISTENCE_PERMANENT:
backend = InvalidBackendId;
break;
default:
elog(ERROR, "invalid relpersistence: %c", relpersistence);
return InvalidOid; /* placate compiler */
}
/* This logic should match RelationInitPhysicalAddr */
rnode.node.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
rnode.node.dbNode = (rnode.node.spcNode == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId;
/*
* The relpath will vary based on the backend ID, so we must initialize
* that properly here to make sure that any collisions based on filename
* are properly detected.
*/
rnode.backend = backend;
do
{
CHECK_FOR_INTERRUPTS();
/* Generate the OID */
if (pg_class)
rnode.node.relNode = GetNewOid(pg_class);
else
rnode.node.relNode = GetNewObjectId();
/* Check for existing file of same name */
rpath = relpath(rnode, MAIN_FORKNUM);
fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY);
if (fd >= 0)
{
/* definite collision */
close(fd);
collides = true;
}
else
{
/*
* Here we have a little bit of a dilemma: if errno is something
* other than ENOENT, should we declare a collision and loop? In
* particular one might think this advisable for, say, EPERM.
* However there really shouldn't be any unreadable files in a
* tablespace directory, and if the EPERM is actually complaining
* that we can't read the directory itself, we'd be in an infinite
* loop. In practice it seems best to go ahead regardless of the
* errno. If there is a colliding file we will get an smgr
* failure when we attempt to create the new relation file.
*/
collides = false;
}
pfree(rpath);
} while (collides);
return rnode.node.relNode;
}
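
/*
 * Illustrative sketch: a caller rewriting a relation into a fresh file
 * (compare RelationSetNewRelfilenode() in relcache.c) passes NULL for
 * pg_class because the relation keeps its existing OID:
 *
 *    newrelfilenode = GetNewRelFileNode(relation->rd_rel->reltablespace,
 *                                       NULL,
 *                                       relation->rd_rel->relpersistence);
 */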