Tom Lane 077db40fa1 ALTER TABLE rewrite. New cool stuff:
* ALTER ... ADD COLUMN with defaults and NOT NULL constraints works per SQL
spec.  A default is implemented by rewriting the table with the new value
stored in each row.

* ALTER COLUMN TYPE.  You can change a column's datatype to anything you
want, so long as you can specify how to convert the old value.  Rewrites
the table.  (Possible future improvement: optimize no-op conversions such
as varchar(N) to varchar(N+1).)

* Multiple ALTER actions in a single ALTER TABLE command.  You can perform
any number of column additions, type changes, and constraint additions with
only one pass over the table contents.

Basic documentation provided in ALTER TABLE ref page, but some more docs
work is needed.

Original patch from Rod Taylor, additional work from Tom Lane.
2004-05-05 04:48:48 +00:00

1818 lines
51 KiB

* index.c
* code to create and destroy POSTGRES index relations
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* $PostgreSQL: pgsql/src/backend/catalog/index.c,v 1.229 2004/05/05 04:48:45 tgl Exp $
* index_create() - Create a cataloged index relation
* index_drop() - Removes index relation from catalogs
* BuildIndexInfo() - Prepare to insert index tuples
* FormIndexDatum() - Construct datum vector for one index tuple
#include "postgres.h"
#include <unistd.h>
#include "access/genam.h"
#include "access/heapam.h"
#include "bootstrap/bootstrap.h"
#include "catalog/catalog.h"
#include "catalog/catname.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_constraint.h"
#include "catalog/pg_index.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "executor/executor.h"
#include "miscadmin.h"
#include "optimizer/clauses.h"
#include "optimizer/prep.h"
#include "parser/parse_expr.h"
#include "parser/parse_func.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/catcache.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/relcache.h"
#include "utils/syscache.h"
* macros used in guessing how many tuples are on a page.
/*
 * Assumed average size, in bytes, of one attribute value.  It is only a
 * guess and is consumed by NTUPLES_PER_PAGE.
 */
#define AVG_ATTR_SIZE 8
/*
 * Estimated number of tuples that fit on one page for a relation with
 * natts attributes: the page size less the aligned page header, divided by
 * an estimated tuple size (natts * AVG_ATTR_SIZE plus the aligned heap tuple
 * header).  The division is integral, so the result is truncated.
 */
#define NTUPLES_PER_PAGE(natts) \
((BLCKSZ - MAXALIGN(sizeof(PageHeaderData))) / \
((natts) * AVG_ATTR_SIZE + MAXALIGN(sizeof(HeapTupleHeaderData))))
/* non-export function prototypes (helpers private to this file) */
/* builds the tuple descriptor describing the columns of a new index */
static TupleDesc ConstructTupleDescriptor(Relation heapRelation,
IndexInfo *indexInfo,
Oid *classObjectId);
/* inserts the pg_class row for the new index relation */
static void UpdateRelationRelation(Relation indexRelation);
/* stamps attrelid = indexoid on each attribute of the index descriptor */
static void InitializeAttributeOids(Relation indexRelation,
int numatts, Oid indexoid);
/* inserts one pg_attribute row per index column */
static void AppendAttributeTuples(Relation indexRelation, int numatts);
/* builds and inserts the pg_index row for the new index */
static void UpdateIndexRelation(Oid indexoid, Oid heapoid,
IndexInfo *indexInfo,
Oid *classOids,
bool primary);
/*
 * NOTE(review): definition is not visible in this part of the file; judging
 * by its use in index_drop it maps an index OID to the OID of its parent
 * heap -- confirm against the definition.
 */
static Oid IndexGetRelation(Oid indexId);
* ConstructTupleDescriptor
* Build an index tuple descriptor for a new index
/*
 * ConstructTupleDescriptor
 *
 * Build and return the tuple descriptor for a new index.  One pg_attribute
 * entry is created per index column (indexInfo->ii_NumIndexAttrs of them).
 *
 * heapRelation   parent table; its descriptor supplies the pg_attribute
 *                rows copied for plain columns, and its relnatts bounds the
 *                acceptable user column numbers
 * indexInfo      supplies ii_KeyAttrNumbers[] and the ii_Expressions list
 * classObjectId  per-column operator class OIDs.  NOTE(review): the key
 *                argument of the CLAOID lookup is missing from this listing;
 *                presumably classObjectId[i] -- confirm upstream.
 *
 * A nonzero key attribute number denotes a plain column (negative numbers
 * are system attributes, positive ones user columns).  The expression path
 * consumes one entry of ii_Expressions per column and reports an error if
 * the list is exhausted.  Every entry is left with attrelid = InvalidOid.
 * If the operator class names a valid key type different from the column
 * type, the type-related fields are overwritten from that type.
 *
 * NOTE(review): the remark further down spells the generated column name
 * with three s characters (pg_expresssion_nnn), but the format string given
 * to sprintf produces pg_expression_N; the remark is the one that is wrong.
 *
 * NOTE(review): braces, else keywords and several call arguments appear to
 * have been lost from this listing, so the branch structure described above
 * is inferred from the surviving remarks; confirm against the upstream file.
 */
static TupleDesc
ConstructTupleDescriptor(Relation heapRelation,
IndexInfo *indexInfo,
Oid *classObjectId)
int numatts = indexInfo->ii_NumIndexAttrs;
List *indexprs = indexInfo->ii_Expressions;
TupleDesc heapTupDesc;
TupleDesc indexTupDesc;
int natts; /* #atts in heap rel --- for error checks */
int i;
/* fetch the heap descriptor and its column count for the checks below */
heapTupDesc = RelationGetDescr(heapRelation);
natts = RelationGetForm(heapRelation)->relnatts;
* allocate the new tuple descriptor
indexTupDesc = CreateTemplateTupleDesc(numatts, false);
* For simple index columns, we copy the pg_attribute row from the
* parent relation and modify it as necessary. For expressions we
* have to cons up a pg_attribute row the hard way.
for (i = 0; i < numatts; i++)
AttrNumber atnum = indexInfo->ii_KeyAttrNumbers[i];
Form_pg_attribute to;
HeapTuple tuple;
Form_pg_type typeTup;
Oid keyType;
/* zero-filled entry, so any field not assigned below stays 0 */
indexTupDesc->attrs[i] = to =
(Form_pg_attribute) palloc0(ATTRIBUTE_TUPLE_SIZE);
if (atnum != 0)
/* Simple index column */
Form_pg_attribute from;
if (atnum < 0)
* here we are indexing on a system attribute (-1...-n)
from = SystemAttributeDefinition(atnum,
* here we are indexing on a normal attribute (1...n)
if (atnum > natts) /* safety check */
elog(ERROR, "invalid column number %d", atnum);
from = heapTupDesc->attrs[AttrNumberGetAttrOffset(atnum)];
* now that we've determined the "from", let's copy the tuple
* desc data...
memcpy(to, from, ATTRIBUTE_TUPLE_SIZE);
* Fix the stuff that should not be the same as the underlying
* attr
/* index columns are numbered 1..numatts in index order */
to->attnum = i + 1;
to->attstattarget = -1;
to->attcacheoff = -1;
to->attnotnull = false;
to->atthasdef = false;
to->attislocal = true;
to->attinhcount = 0;
/* Expressional index */
Node *indexkey;
if (indexprs == NIL) /* shouldn't happen */
elog(ERROR, "too few entries in indexprs list");
/* take the next expression and advance the list cursor */
indexkey = (Node *) lfirst(indexprs);
indexprs = lnext(indexprs);
* Make the attribute's name "pg_expresssion_nnn" (maybe think
* of something better later)
sprintf(NameStr(to->attname), "pg_expression_%d", i + 1);
* Lookup the expression type in pg_type for the type length
* etc.
keyType = exprType(indexkey);
tuple = SearchSysCache(TYPEOID,
0, 0, 0);
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for type %u", keyType);
typeTup = (Form_pg_type) GETSTRUCT(tuple);
* Assign some of the attributes values. Leave the rest as 0.
to->attnum = i + 1;
to->atttypid = keyType;
to->attlen = typeTup->typlen;
to->attbyval = typeTup->typbyval;
to->attstorage = typeTup->typstorage;
to->attalign = typeTup->typalign;
to->attstattarget = -1;
to->attcacheoff = -1;
to->atttypmod = -1;
to->attislocal = true;
* We do not yet have the correct relation OID for the index, so
* just set it invalid for now. InitializeAttributeOids() will
* fix it later.
to->attrelid = InvalidOid;
* Check the opclass to see if it provides a keytype (overriding
* the attribute type).
tuple = SearchSysCache(CLAOID,
0, 0, 0);
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for opclass %u",
keyType = ((Form_pg_opclass) GETSTRUCT(tuple))->opckeytype;
/* only override when the opclass names a valid, different storage type */
if (OidIsValid(keyType) && keyType != to->atttypid)
/* index value and heap value have different types */
tuple = SearchSysCache(TYPEOID,
0, 0, 0);
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for type %u", keyType);
typeTup = (Form_pg_type) GETSTRUCT(tuple);
to->atttypid = keyType;
to->atttypmod = -1;
to->attlen = typeTup->typlen;
to->attbyval = typeTup->typbyval;
to->attalign = typeTup->typalign;
to->attstorage = typeTup->typstorage;
return indexTupDesc;
/* ----------------------------------------------------------------
* UpdateRelationRelation
* ----------------------------------------------------------------
/*
 * UpdateRelationRelation
 *
 * Insert the pg_class row for a newly created index relation.
 *
 * indexRelation  the new index; its rd_rel struct supplies the row contents
 *                and its relation OID becomes the OID of the inserted tuple
 *
 * pg_class is opened and closed under RowExclusiveLock, and the catalog
 * indexes on pg_class are updated for the new tuple.
 *
 * NOTE(review): the length argument of heap_addheader is missing from this
 * listing, and no release of the tuple is visible; confirm upstream.
 */
static void
UpdateRelationRelation(Relation indexRelation)
Relation pg_class;
HeapTuple tuple;
pg_class = heap_openr(RelationRelationName, RowExclusiveLock);
/* XXX Natts_pg_class_fixed is a hack - see pg_class.h */
tuple = heap_addheader(Natts_pg_class_fixed,
(void *) indexRelation->rd_rel);
* the new tuple must have the oid already chosen for the index. sure
* would be embarrassing to do this sort of thing in polite company.
HeapTupleSetOid(tuple, RelationGetRelid(indexRelation));
simple_heap_insert(pg_class, tuple);
/* update the system catalog indexes */
CatalogUpdateIndexes(pg_class, tuple);
heap_close(pg_class, RowExclusiveLock);
/* ----------------------------------------------------------------
* InitializeAttributeOids
* ----------------------------------------------------------------
/*
 * InitializeAttributeOids
 *
 * Set attrelid to indexoid in the first numatts attribute entries of the
 * tuple descriptor of indexRelation.  The descriptor is modified in place;
 * nothing is written to the catalogs here.
 *
 * indexRelation  index whose descriptor is updated
 * numatts        number of leading attribute entries to stamp
 * indexoid       relation OID to store in each entry
 */
static void
InitializeAttributeOids(Relation indexRelation,
int numatts,
Oid indexoid)
TupleDesc tupleDescriptor;
int i;
tupleDescriptor = RelationGetDescr(indexRelation);
for (i = 0; i < numatts; i += 1)
tupleDescriptor->attrs[i]->attrelid = indexoid;
/* ----------------------------------------------------------------
* AppendAttributeTuples
* ----------------------------------------------------------------
/*
 * AppendAttributeTuples
 *
 * Insert one pg_attribute row for each of the first numatts attributes in
 * the tuple descriptor of indexRelation, and add each row to the catalog
 * indexes on pg_attribute.
 *
 * indexRelation  new index whose descriptor entries are written out
 * numatts        number of attribute rows to insert
 *
 * The descriptor entries are expected to be fully prepared already: the
 * assertions check that attnum equals position + 1 and attcacheoff is -1.
 * pg_attribute is opened and closed under RowExclusiveLock.
 *
 * NOTE(review): the length argument of heap_addheader is missing from this
 * listing, and no call releasing indstate or the tuples is visible; confirm
 * against the upstream file.
 */
static void
AppendAttributeTuples(Relation indexRelation, int numatts)
Relation pg_attribute;
CatalogIndexState indstate;
TupleDesc indexTupDesc;
HeapTuple new_tuple;
int i;
* open the attribute relation and its indexes
pg_attribute = heap_openr(AttributeRelationName, RowExclusiveLock);
indstate = CatalogOpenIndexes(pg_attribute);
* insert data from new index's tupdesc into pg_attribute
indexTupDesc = RelationGetDescr(indexRelation);
for (i = 0; i < numatts; i++)
* There used to be very grotty code here to set these fields, but
* I think it's unnecessary. They should be set already.
Assert(indexTupDesc->attrs[i]->attnum == i + 1);
Assert(indexTupDesc->attrs[i]->attcacheoff == -1);
/* wrap the descriptor entry in a heap tuple, then insert and index it */
new_tuple = heap_addheader(Natts_pg_attribute,
(void *) indexTupDesc->attrs[i]);
simple_heap_insert(pg_attribute, new_tuple);
CatalogIndexInsert(indstate, new_tuple);
heap_close(pg_attribute, RowExclusiveLock);
/* ----------------------------------------------------------------
* UpdateIndexRelation
* ----------------------------------------------------------------
/*
 * UpdateIndexRelation
 *
 * Build the pg_index row describing a new index and insert it.
 *
 * indexoid   OID of the index relation (stored as indexrelid)
 * heapoid    OID of the parent table (stored as indrelid)
 * indexInfo  supplies the key attribute numbers, the column count, the
 *            uniqueness flag, and the optional expression and predicate
 *            trees
 * classOids  operator class OID for each index column
 * primary    stored as indisprimary
 *
 * indkey and indclass are copied into zero-filled arrays of INDEX_MAX_KEYS
 * entries, so unused trailing slots are 0.  indisclustered is always stored
 * as false.  Expressions and predicate are serialized with nodeToString;
 * when either is absent its datum is 0 and the matching nulls[] slot is set
 * to the character n, whereas all other slots keep the blank they were
 * initialised with.
 *
 * pg_index is opened and closed under RowExclusiveLock and its catalog
 * indexes are updated for the new tuple.
 *
 * NOTE(review): the second argument of both DirectFunctionCall1(textin, ...)
 * calls, the else keywords, and any freeing of the tuple are missing from
 * this listing; confirm against the upstream file.
 */
static void
UpdateIndexRelation(Oid indexoid,
Oid heapoid,
IndexInfo *indexInfo,
Oid *classOids,
bool primary)
int16 indkey[INDEX_MAX_KEYS];
Oid indclass[INDEX_MAX_KEYS];
Datum exprsDatum;
Datum predDatum;
Datum values[Natts_pg_index];
char nulls[Natts_pg_index];
Relation pg_index;
HeapTuple tuple;
int i;
* Copy the index key and opclass info into zero-filled vectors
MemSet(indkey, 0, sizeof(indkey));
MemSet(indclass, 0, sizeof(indclass));
for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
indkey[i] = indexInfo->ii_KeyAttrNumbers[i];
indclass[i] = classOids[i];
* Convert the index expressions (if any) to a text datum
if (indexInfo->ii_Expressions != NIL)
char *exprsString;
exprsString = nodeToString(indexInfo->ii_Expressions);
exprsDatum = DirectFunctionCall1(textin,
/* no expressions: a zero datum marks the column as null further down */
exprsDatum = (Datum) 0;
* Convert the index predicate (if any) to a text datum. Note we
* convert implicit-AND format to normal explicit-AND for storage.
if (indexInfo->ii_Predicate != NIL)
char *predString;
predString = nodeToString(make_ands_explicit(indexInfo->ii_Predicate));
predDatum = DirectFunctionCall1(textin,
/* no predicate: a zero datum marks the column as null further down */
predDatum = (Datum) 0;
* open the system catalog index relation
pg_index = heap_openr(IndexRelationName, RowExclusiveLock);
* Build a pg_index tuple
/* start with every column flagged non-null (blank) */
MemSet(nulls, ' ', sizeof(nulls));
values[Anum_pg_index_indexrelid - 1] = ObjectIdGetDatum(indexoid);
values[Anum_pg_index_indrelid - 1] = ObjectIdGetDatum(heapoid);
values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey);
values[Anum_pg_index_indclass - 1] = PointerGetDatum(indclass);
values[Anum_pg_index_indnatts - 1] = Int16GetDatum(indexInfo->ii_NumIndexAttrs);
values[Anum_pg_index_indisunique - 1] = BoolGetDatum(indexInfo->ii_Unique);
values[Anum_pg_index_indisprimary - 1] = BoolGetDatum(primary);
values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false);
values[Anum_pg_index_indexprs - 1] = exprsDatum;
if (exprsDatum == (Datum) 0)
nulls[Anum_pg_index_indexprs - 1] = 'n';
values[Anum_pg_index_indpred - 1] = predDatum;
if (predDatum == (Datum) 0)
nulls[Anum_pg_index_indpred - 1] = 'n';
tuple = heap_formtuple(RelationGetDescr(pg_index), values, nulls);
* insert the tuple into the pg_index catalog
simple_heap_insert(pg_index, tuple);
/* update the indexes on pg_index */
CatalogUpdateIndexes(pg_index, tuple);
* close the relation and free the tuple
heap_close(pg_index, RowExclusiveLock);
/* ----------------------------------------------------------------
* index_create
* Returns OID of the created index.
* ----------------------------------------------------------------
/*
 * index_create
 *
 * Create a cataloged index on an existing table and return the OID of the
 * new index relation.
 *
 * heapRelationId           OID of the table to index; opened with ShareLock
 * indexRelationName        name for the index; must not already name a
 *                          relation in the namespace of the table
 * indexInfo                index description; ii_NumIndexAttrs must be at
 *                          least 1
 * accessMethodObjectId     stored as relam of the index
 * classObjectId            operator class OID per index column
 * primary                  passed on to the pg_index row; also selects the
 *                          PRIMARY constraint type when isconstraint is set
 * isconstraint             when true (outside bootstrap mode) a
 *                          pg_constraint entry is made and the index gets an
 *                          internal dependency on it; the index must then be
 *                          primary or unique and have no expressions
 * allow_system_table_mods  when false, a check involving IsSystemRelation
 *                          rejects the request (NOTE(review): part of that
 *                          condition is missing from this listing)
 * skip_build               when true the index is not filled here and both
 *                          relations are closed without releasing locks
 *
 * Steps visible here: validate arguments, build the tuple descriptor, create
 * the relation via heap_create, take AccessExclusiveLock on it, patch the
 * relowner/relam/relkind/relhasoids fields of its pg_class form, write the
 * pg_attribute and pg_index rows, record dependencies (skipped in bootstrap
 * mode), then either register the index for a later build (bootstrap), leave
 * it unbuilt (skip_build), or call index_build.
 *
 * NOTE(review): the return type line, braces, else keywords, the ereport
 * openers, and the calls that store the pg_class entry, stamp the attribute
 * OIDs, and record expression/predicate dependencies are missing from this
 * listing; only their remarks or trailing arguments survive.  Confirm
 * against the upstream file before relying on the step order above.
 */
index_create(Oid heapRelationId,
const char *indexRelationName,
IndexInfo *indexInfo,
Oid accessMethodObjectId,
Oid *classObjectId,
bool primary,
bool isconstraint,
bool allow_system_table_mods,
bool skip_build)
Relation heapRelation;
Relation indexRelation;
TupleDesc indexTupDesc;
bool shared_relation;
Oid namespaceId;
Oid indexoid;
int i;
* Only SELECT ... FOR UPDATE are allowed while doing this
heapRelation = heap_open(heapRelationId, ShareLock);
* The index will be in the same namespace as its parent table, and is
* shared across databases if and only if the parent is.
namespaceId = RelationGetNamespace(heapRelation);
shared_relation = heapRelation->rd_rel->relisshared;
* check parameters
if (indexInfo->ii_NumIndexAttrs < 1)
elog(ERROR, "must index at least one column");
if (!allow_system_table_mods &&
IsSystemRelation(heapRelation) &&
errmsg("user-defined indexes on system catalog tables are not supported")));
* We cannot allow indexing a shared relation after initdb (because
* there's no way to make the entry in other databases' pg_class).
* Unfortunately we can't distinguish initdb from a manually started
* standalone backend (toasting of shared rels happens after the bootstrap
* phase, so checking IsBootstrapProcessingMode() won't work). However,
* we can at least prevent this mistake under normal multi-user operation.
if (shared_relation && IsUnderPostmaster)
errmsg("shared indexes cannot be created after initdb")));
/* refuse a name that already resolves to a relation in this namespace */
if (get_relname_relid(indexRelationName, namespaceId))
errmsg("relation \"%s\" already exists",
* construct tuple descriptor for index tuples
indexTupDesc = ConstructTupleDescriptor(heapRelation,
* create the index relation's relcache entry and physical disk file.
* (If we fail further down, it's the smgr's responsibility to remove
* the disk file again.)
indexRelation = heap_create(indexRelationName,
/* Fetch the relation OID assigned by heap_create */
indexoid = RelationGetRelid(indexRelation);
* Obtain exclusive lock on it. Although no other backends can see it
* until we commit, this prevents deadlock-risk complaints from lock
* manager in cases such as CLUSTER.
LockRelation(indexRelation, AccessExclusiveLock);
* Fill in fields of the index's pg_class entry that are not set
* correctly by heap_create.
* XXX should have a cleaner way to create cataloged indexes
indexRelation->rd_rel->relowner = GetUserId();
indexRelation->rd_rel->relam = accessMethodObjectId;
indexRelation->rd_rel->relkind = RELKIND_INDEX;
indexRelation->rd_rel->relhasoids = false;
* store index's pg_class entry
* now update the object id's of all the attribute tuple forms in the
* index relation's tuple descriptor
* append ATTRIBUTE tuples for the index
AppendAttributeTuples(indexRelation, indexInfo->ii_NumIndexAttrs);
/* ----------------
* update pg_index
* (append INDEX tuple)
* Note that this stows away a representation of "predicate".
* (Or, could define a rule to maintain the predicate) --Nels, Feb '92
* ----------------
UpdateIndexRelation(indexoid, heapRelationId, indexInfo,
classObjectId, primary);
* Register constraint and dependencies for the index.
* If the index is from a CONSTRAINT clause, construct a pg_constraint
* entry. The index is then linked to the constraint, which in turn
* is linked to the table. If it's not a CONSTRAINT, make the
* dependency directly on the table.
* We don't need a dependency on the namespace, because there'll be an
* indirect dependency via our parent table.
* During bootstrap we can't register any dependencies, and we don't try
* to make a constraint either.
if (!IsBootstrapProcessingMode())
ObjectAddress myself,
myself.classId = RelOid_pg_class;
myself.objectId = indexoid;
myself.objectSubId = 0;
if (isconstraint)
char constraintType;
Oid conOid;
if (primary)
constraintType = CONSTRAINT_PRIMARY;
else if (indexInfo->ii_Unique)
constraintType = CONSTRAINT_UNIQUE;
elog(ERROR, "constraint must be PRIMARY or UNIQUE");
constraintType = 0; /* keep compiler quiet */
/* Shouldn't have any expressions */
if (indexInfo->ii_Expressions)
elog(ERROR, "constraints can't have index expressions");
/* the constraint entry takes the same name as the index */
conOid = CreateConstraintEntry(indexRelationName,
false, /* isDeferrable */
false, /* isDeferred */
InvalidOid, /* no domain */
InvalidOid, /* no foreign key */
' ',
' ',
' ',
InvalidOid, /* no associated index */
NULL, /* no check constraint */
referenced.classId = get_system_catalog_relid(ConstraintRelationName);
referenced.objectId = conOid;
referenced.objectSubId = 0;
recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL);
/* Create auto dependencies on simply-referenced columns */
for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
if (indexInfo->ii_KeyAttrNumbers[i] != 0)
referenced.classId = RelOid_pg_class;
referenced.objectId = heapRelationId;
referenced.objectSubId = indexInfo->ii_KeyAttrNumbers[i];
recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);
/* Store dependency on operator classes */
referenced.classId = get_system_catalog_relid(OperatorClassRelationName);
for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
referenced.objectId = classObjectId[i];
referenced.objectSubId = 0;
recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL);
/* Store dependencies on anything mentioned in index expressions */
if (indexInfo->ii_Expressions)
(Node *) indexInfo->ii_Expressions,
/* Store dependencies on anything mentioned in predicate */
if (indexInfo->ii_Predicate)
(Node *) indexInfo->ii_Predicate,
* Fill in the index strategy structure with information from the
* catalogs. First we must advance the command counter so that we
* will see the newly-entered index catalog tuples.
* If this is bootstrap (initdb) time, then we don't actually fill in
* the index yet. We'll be creating more indexes and classes later,
* so we delay filling them in until just before we're done with
* bootstrapping. Similarly, if the caller specified skip_build then
* filling the index is delayed till later (ALTER TABLE can save work
* in some cases with this). Otherwise, we call the AM routine that
* constructs the index.
* In normal processing mode, the heap and index relations are closed,
* but we continue to hold the ShareLock on the heap and the exclusive
* lock on the index that we acquired above, until end of transaction.
if (IsBootstrapProcessingMode())
index_register(heapRelationId, indexoid, indexInfo);
/* XXX shouldn't we close the heap and index rels here? */
else if (skip_build)
/* caller is responsible for filling the index later on */
/* NoLock: close the relcache references but keep the locks held */
relation_close(indexRelation, NoLock);
heap_close(heapRelation, NoLock);
index_build(heapRelation, indexRelation, indexInfo);
/* index_build closes the passed rels */
return indexoid;
* index_drop
* NOTE: this routine should now only be called through performDeletion(),
* else associated dependencies won't be cleaned up.
/*
 * index_drop
 *
 * Remove an index: delete its pg_index row, drop statistics stored for
 * expression columns, flush its buffers, and release its storage manager
 * reference.
 *
 * indexId  OID of the index to remove
 *
 * AccessExclusiveLock is taken on the parent table (found through
 * IndexGetRelation) and on the index itself.  The parent table is closed
 * with NoLock, so its lock is not released here.  An error is raised if the
 * pg_index row cannot be found or if FlushRelationBuffers returns a negative
 * value.
 *
 * NOTE(review): the return type line, braces, the lookup key for the
 * INDEXRELID cache search, the calls that fix the pg_class and pg_attribute
 * entries, the storage unlink call, the cache invalidation call, and the
 * close of the index relation are missing from this listing; only their
 * remarks survive.  Confirm against the upstream file.
 */
index_drop(Oid indexId)
Oid heapId;
Relation userHeapRelation;
Relation userIndexRelation;
Relation indexRelation;
HeapTuple tuple;
bool hasexprs;
int i;
* To drop an index safely, we must grab exclusive lock on its parent
* table; otherwise there could be other backends using the index!
* Exclusive lock on the index alone is insufficient because another
* backend might be in the midst of devising a query plan that will
* use the index. The parser and planner take care to hold an
* appropriate lock on the parent table while working, but having them
* hold locks on all the indexes too seems overly complex. We do grab
* exclusive lock on the index too, just to be safe. Both locks must
* be held till end of transaction, else other backends will still see
* this index in pg_index.
/* lock order: parent table first, then the index */
heapId = IndexGetRelation(indexId);
userHeapRelation = heap_open(heapId, AccessExclusiveLock);
userIndexRelation = index_open(indexId);
LockRelation(userIndexRelation, AccessExclusiveLock);
* fix RELATION relation
* fix ATTRIBUTE relation
* fix INDEX relation, and check for expressional index
indexRelation = heap_openr(IndexRelationName, RowExclusiveLock);
tuple = SearchSysCache(INDEXRELID,
0, 0, 0);
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for index %u", indexId);
/* a non-null indexprs column means the index has expression columns */
hasexprs = !heap_attisnull(tuple, Anum_pg_index_indexprs);
simple_heap_delete(indexRelation, &tuple->t_self);
heap_close(indexRelation, RowExclusiveLock);
* if it has any expression columns, we might have stored
* statistics about them.
if (hasexprs)
RemoveStatistics(userIndexRelation, 0);
* flush buffer cache and physically remove the file
i = FlushRelationBuffers(userIndexRelation, (BlockNumber) 0);
if (i < 0)
elog(ERROR, "FlushRelationBuffers returned %d", i);
/* make sure a storage manager handle exists before it is used */
if (userIndexRelation->rd_smgr == NULL)
userIndexRelation->rd_smgr = smgropen(userIndexRelation->rd_node);
userIndexRelation->rd_smgr = NULL;
* We are presently too lazy to attempt to compute the new correct
* value of relhasindex (the next VACUUM will fix it if necessary). So
* there is no need to update the pg_class tuple for the owning
* relation. But we must send out a shared-cache-inval notice on the
* owning relation to ensure other backends update their relcache
* lists of indexes.
* Close rels, but keep locks
heap_close(userHeapRelation, NoLock);
/* ----------------------------------------------------------------
* index_build support
* ----------------------------------------------------------------
/* ----------------
* BuildIndexInfo
* Construct an IndexInfo record for an open index
* IndexInfo stores the information about the index that's needed by
* FormIndexDatum, which is used for both index_build() and later insertion
* of individual index tuples. Normally we build an IndexInfo for an index
* just once per command, and then use it for (potentially) many tuples.
* ----------------
/*
 * BuildIndexInfo
 *
 * Allocate an IndexInfo node and fill it from an open index relation.
 *
 * index  open index relation; its rd_index form supplies the column count,
 *        key attribute numbers and uniqueness flag
 *
 * Returns the new IndexInfo.  The expression and predicate trees are
 * fetched through RelationGetIndexExpressions / RelationGetIndexPredicate,
 * and both execution-state lists start out as NIL.  An error is raised when
 * indnatts is below 1 or above INDEX_MAX_KEYS.
 */
IndexInfo *
BuildIndexInfo(Relation index)
IndexInfo *ii = makeNode(IndexInfo);
Form_pg_index indexStruct = index->rd_index;
int i;
int numKeys;
/* check the number of keys, and copy attr numbers into the IndexInfo */
numKeys = indexStruct->indnatts;
if (numKeys < 1 || numKeys > INDEX_MAX_KEYS)
elog(ERROR, "invalid indnatts %d for index %u",
numKeys, RelationGetRelid(index));
ii->ii_NumIndexAttrs = numKeys;
for (i = 0; i < numKeys; i++)
ii->ii_KeyAttrNumbers[i] = indexStruct->indkey[i];
/* fetch any expressions needed for expressional indexes */
ii->ii_Expressions = RelationGetIndexExpressions(index);
/* execution state is left empty here; FormIndexDatum fills it on first use */
ii->ii_ExpressionsState = NIL;
/* fetch index predicate if any */
ii->ii_Predicate = RelationGetIndexPredicate(index);
ii->ii_PredicateState = NIL;
/* other info */
ii->ii_Unique = indexStruct->indisunique;
return ii;
/* ----------------
* FormIndexDatum
* Construct Datum[] and nullv[] arrays for a new index tuple.
* indexInfo Info about the index
* heapTuple Heap tuple for which we must prepare an index entry
* heapDescriptor tupledesc for heap tuple
* estate executor state for evaluating any index expressions
* datum Array of index Datums (output area)
* nullv Array of is-null indicators (output area)
* When there are no index expressions, estate may be NULL. Otherwise it
* must be supplied, *and* the ecxt_scantuple slot of its per-tuple expr
* context must point to the heap tuple passed in.
* For largely historical reasons, we don't actually call index_formtuple()
* here, we just prepare its input arrays datum[] and nullv[].
* ----------------
/*
 * FormIndexDatum
 *
 * Fill datum[] and nullv[] with the index column values for one heap tuple.
 *
 * indexInfo       index description; its ii_ExpressionsState list is
 *                 created here on first use when expressions exist
 * heapTuple       heap tuple being indexed
 * heapDescriptor  tuple descriptor for heapTuple
 * estate          executor state used for expression evaluation
 * datum           output: one Datum per index column
 * nullv           output: the character n for a null column, blank otherwise
 *
 * Columns with a nonzero key attribute number are read with heap_getattr;
 * the remaining columns each consume one entry of the expression state
 * list.  An error is raised if that list has too few or too many entries
 * for the number of expression columns.
 *
 * NOTE(review): the return type line, braces, the else keyword, and the
 * trailing arguments of ExecPrepareExpr and ExecEvalExprSwitchContext are
 * missing from this listing; confirm against the upstream file.
 */
FormIndexDatum(IndexInfo *indexInfo,
HeapTuple heapTuple,
TupleDesc heapDescriptor,
EState *estate,
Datum *datum,
char *nullv)
List *indexprs;
int i;
if (indexInfo->ii_Expressions != NIL &&
indexInfo->ii_ExpressionsState == NIL)
/* First time through, set up expression evaluation state */
indexInfo->ii_ExpressionsState = (List *)
ExecPrepareExpr((Expr *) indexInfo->ii_Expressions,
/* Check caller has set up context correctly */
Assert(GetPerTupleExprContext(estate)->ecxt_scantuple->val == heapTuple);
/* cursor over the prepared expressions, advanced once per expression column */
indexprs = indexInfo->ii_ExpressionsState;
for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
int keycol = indexInfo->ii_KeyAttrNumbers[i];
Datum iDatum;
bool isNull;
if (keycol != 0)
* Plain index column; get the value we need directly from the
* heap tuple.
iDatum = heap_getattr(heapTuple, keycol, heapDescriptor, &isNull);
* Index expression --- need to evaluate it.
if (indexprs == NIL)
elog(ERROR, "wrong number of index expressions");
iDatum = ExecEvalExprSwitchContext((ExprState *) lfirst(indexprs),
indexprs = lnext(indexprs);
datum[i] = iDatum;
nullv[i] = (isNull) ? 'n' : ' ';
/* leftover expressions mean the list and the key array disagree */
if (indexprs != NIL)
elog(ERROR, "wrong number of index expressions");
/* ----------------
* set relhasindex of relation's pg_class entry
* If isprimary is TRUE, we are defining a primary index, so also set
* relhaspkey to TRUE. Otherwise, leave relhaspkey alone.
* If reltoastidxid is not InvalidOid, also set reltoastidxid to that value.
* This is only used for TOAST relations.
* NOTE: an important side-effect of this operation is that an SI invalidation
* message is sent out to all backends --- including me --- causing relcache
* entries to be flushed or updated with the new hasindex data. This must
* happen even if we find that no change is needed in the pg_class row.
* ----------------
/*
 * setRelhasindex
 *
 * Update the relhasindex flag (and optionally relhaspkey and reltoastidxid)
 * in the pg_class row of a relation.
 *
 * relid          OID of the relation whose pg_class row is updated
 * hasindex       new value for relhasindex
 * isprimary      when true, relhaspkey is set to true; it is never cleared
 * reltoastidxid  when valid, stored as reltoastidxid; the row is asserted
 *                to describe a TOAST relation in that case
 *
 * Outside bootstrap mode the row is fetched as a syscache copy and written
 * back with simple_heap_update when something changed.  In bootstrap mode
 * the row is located with a heap scan (pg_class_scan is non-NULL only on
 * that path) and modified in place under an exclusive buffer lock.
 * An error is raised when the row cannot be found.
 *
 * NOTE(review): the return type line, braces, else keywords, the lookup key
 * for both fetch paths, the in-place write call, the forced-invalidation
 * call in the no-change case, and the cleanup of the tuple or scan are
 * missing from this listing; confirm against the upstream file.
 */
setRelhasindex(Oid relid, bool hasindex, bool isprimary, Oid reltoastidxid)
Relation pg_class;
HeapTuple tuple;
Form_pg_class classtuple;
bool dirty = false;
HeapScanDesc pg_class_scan = NULL;
* Find the tuple to update in pg_class. In bootstrap mode we can't
* use heap_update, so cheat and overwrite the tuple in-place. In
* normal processing, make a copy to scribble on.
pg_class = heap_openr(RelationRelationName, RowExclusiveLock);
if (!IsBootstrapProcessingMode())
tuple = SearchSysCacheCopy(RELOID,
0, 0, 0);
ScanKeyData key[1];
BTEqualStrategyNumber, F_OIDEQ,
pg_class_scan = heap_beginscan(pg_class, SnapshotNow, 1, key);
tuple = heap_getnext(pg_class_scan, ForwardScanDirection);
if (!HeapTupleIsValid(tuple))
elog(ERROR, "could not find tuple for relation %u", relid);
classtuple = (Form_pg_class) GETSTRUCT(tuple);
/* Apply required updates */
/* the in-place path edits a shared buffer, so lock it while scribbling */
if (pg_class_scan)
LockBuffer(pg_class_scan->rs_cbuf, BUFFER_LOCK_EXCLUSIVE);
if (classtuple->relhasindex != hasindex)
classtuple->relhasindex = hasindex;
dirty = true;
if (isprimary)
if (!classtuple->relhaspkey)
classtuple->relhaspkey = true;
dirty = true;
if (OidIsValid(reltoastidxid))
Assert(classtuple->relkind == RELKIND_TOASTVALUE);
if (classtuple->reltoastidxid != reltoastidxid)
classtuple->reltoastidxid = reltoastidxid;
dirty = true;
if (pg_class_scan)
LockBuffer(pg_class_scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
if (pg_class_scan)
/* Write the modified tuple in-place */
/* Send out shared cache inval if necessary */
if (!IsBootstrapProcessingMode())
CacheInvalidateHeapTuple(pg_class, tuple);
BufferSync(-1, -1);
else if (dirty)
simple_heap_update(pg_class, &tuple->t_self, tuple);
/* Keep the catalog indexes up to date */
CatalogUpdateIndexes(pg_class, tuple);
/* no need to change tuple, but force relcache rebuild anyway */
if (!pg_class_scan)
heap_close(pg_class, RowExclusiveLock);
* setNewRelfilenode - assign a new relfilenode value to the relation
* Caller must already hold exclusive lock on the relation.
/*
 * setNewRelfilenode
 *
 * Give a relation a fresh relfilenode: allocate a new OID, create a storage
 * file under it, schedule the old file for unlinking, and store the new
 * value in the pg_class row of the relation.
 *
 * relation  open relation to re-point.  The assertion rejects nailed
 *           relations unless they are indexes.
 *
 * The new file is created through smgropen/smgrcreate on a copy of rd_node
 * whose relNode is replaced.  The rd_smgr pointer of the relation is reset
 * to NULL after the old file is scheduled for unlink.  pg_class is opened
 * and closed under RowExclusiveLock and its catalog indexes are updated.
 * An error is raised when the pg_class row cannot be found.
 *
 * NOTE(review): the return type line, braces, the shared-relation check,
 * the lookup key and trailing elog argument of the pg_class fetch, any
 * release of the tuple, and the final step that makes the change visible
 * are missing from this listing; confirm against the upstream file.
 */
setNewRelfilenode(Relation relation)
Oid newrelfilenode;
RelFileNode newrnode;
SMgrRelation srel;
Relation pg_class;
HeapTuple tuple;
Form_pg_class rd_rel;
/* Can't change relfilenode for nailed tables (indexes ok though) */
Assert(!relation->rd_isnailed ||
relation->rd_rel->relkind == RELKIND_INDEX);
/* Can't change for shared tables or indexes */
/* Allocate a new relfilenode */
newrelfilenode = newoid();
* Find the pg_class tuple for the given relation. This is not used
* during bootstrap, so okay to use heap_update always.
pg_class = heap_openr(RelationRelationName, RowExclusiveLock);
tuple = SearchSysCacheCopy(RELOID,
0, 0, 0);
if (!HeapTupleIsValid(tuple))
elog(ERROR, "could not find tuple for relation %u",
rd_rel = (Form_pg_class) GETSTRUCT(tuple);
/* create another storage file. Is it a little ugly ? */
/* NOTE: any conflict in relfilenode value will be caught here */
/* same location as the old file, differing only in relNode */
newrnode = relation->rd_node;
newrnode.relNode = newrelfilenode;
srel = smgropen(newrnode);
smgrcreate(srel, relation->rd_istemp, false);
/* schedule unlinking old relfilenode */
if (relation->rd_smgr == NULL)
relation->rd_smgr = smgropen(relation->rd_node);
smgrscheduleunlink(relation->rd_smgr, relation->rd_istemp);
relation->rd_smgr = NULL;
/* update the pg_class row */
rd_rel->relfilenode = newrelfilenode;
simple_heap_update(pg_class, &tuple->t_self, tuple);
CatalogUpdateIndexes(pg_class, tuple);
heap_close(pg_class, RowExclusiveLock);
/* Make sure the relfilenode change is visible */
/* ----------------
* UpdateStats
* Update pg_class' relpages and reltuples statistics for the given relation
* (which can be either a table or an index). Note that this is not used
* in the context of VACUUM.
* ----------------
/*
 * UpdateStats
 *
 * Store page and tuple counts for a relation in its pg_class row.
 *
 * relid      OID of the table or index; opened with ShareLock and closed
 *            with NoLock
 * reltuples  tuple count to record.  Zero is treated as unknown and is
 *            replaced by an estimate.
 *
 * relpages is taken from RelationGetNumberOfBlocks.  When reltuples is 0:
 * an empty relation gets the placeholder values 1000 tuples / 10 pages; an
 * index of at most 2 pages gets 1000 tuples; any other relation gets
 * relpages * NTUPLES_PER_PAGE(relnatts).  The pg_class row is only written
 * when relpages or reltuples differ from the stored values (compared after
 * conversion to int32 and float4).
 *
 * With in_place_upd set the row is found by heap scan and overwritten under
 * an exclusive buffer lock; otherwise a syscache copy is modified and
 * written back with simple_heap_update plus a catalog index update.  The
 * relcache entry of the relation is also patched directly with the new
 * values.  An error is raised when the pg_class row cannot be found.
 *
 * NOTE(review): the return type line, braces, else keywords, the command
 * counter increment mentioned in the remarks, the second half of the
 * in_place_upd condition, the lookup key for both fetch paths, the in-place
 * write call, and the cleanup of the tuple or scan are missing from this
 * listing; confirm against the upstream file.
 */
UpdateStats(Oid relid, double reltuples)
Relation whichRel;
Relation pg_class;
HeapTuple tuple;
BlockNumber relpages;
Form_pg_class rd_rel;
HeapScanDesc pg_class_scan = NULL;
bool in_place_upd;
* This routine handles updates for both the heap and index relation
* statistics. In order to guarantee that we're able to *see* the
* index relation tuple, we bump the command counter id here. The
* index relation tuple was created in the current transaction.
* CommandCounterIncrement() flushes invalid cache entries, including
* those for the heap and index relations for which we're updating
* statistics. Now that the cache is flushed, it's safe to open the
* relation again. We need the relation open in order to figure out
* how many blocks it contains.
* Grabbing lock here is probably redundant ...
whichRel = relation_open(relid, ShareLock);
* Find the tuple to update in pg_class. Normally we make a copy of
* the tuple using the syscache, modify it, and apply heap_update.
* But in bootstrap mode we can't use heap_update, so we cheat and
* overwrite the tuple in-place.
* We also must cheat if reindexing pg_class itself, because the
* target index may presently not be part of the set of indexes that
* CatalogUpdateIndexes would update (see reindex_relation). In this
* case the stats updates will not be WAL-logged and so could be lost
* in a crash. This seems OK considering VACUUM does the same thing.
pg_class = heap_openr(RelationRelationName, RowExclusiveLock);
in_place_upd = IsBootstrapProcessingMode() ||
if (!in_place_upd)
tuple = SearchSysCacheCopy(RELOID,
0, 0, 0);
ScanKeyData key[1];
BTEqualStrategyNumber, F_OIDEQ,
pg_class_scan = heap_beginscan(pg_class, SnapshotNow, 1, key);
tuple = heap_getnext(pg_class_scan, ForwardScanDirection);
if (!HeapTupleIsValid(tuple))
elog(ERROR, "could not find tuple for relation %u", relid);
rd_rel = (Form_pg_class) GETSTRUCT(tuple);
* Figure values to insert.
* If we found zero tuples in the scan, do NOT believe it; instead put a
* bogus estimate into the statistics fields. Otherwise, the common
* pattern "CREATE TABLE; CREATE INDEX; insert data" leaves the table
* with zero size statistics until a VACUUM is done. The optimizer
* will generate very bad plans if the stats claim the table is empty
* when it is actually sizable. See also CREATE TABLE in heap.c.
* Note: this path is also taken during bootstrap, because bootstrap.c
* passes reltuples = 0 after loading a table. We have to estimate
* some number for reltuples based on the actual number of pages.
relpages = RelationGetNumberOfBlocks(whichRel);
if (reltuples == 0)
if (relpages == 0)
/* Bogus defaults for a virgin table, same as heap.c */
reltuples = 1000;
relpages = 10;
else if (whichRel->rd_rel->relkind == RELKIND_INDEX && relpages <= 2)
/* Empty index, leave bogus defaults in place */
reltuples = 1000;
/* otherwise scale the real page count by the guessed tuples per page */
reltuples = ((double) relpages) * NTUPLES_PER_PAGE(whichRel->rd_rel->relnatts);
* Update statistics in pg_class, if they changed. (Avoiding an
* unnecessary update is not just a tiny performance improvement; it
* also reduces the window wherein concurrent CREATE INDEX commands
* may conflict.)
if (rd_rel->relpages != (int32) relpages ||
rd_rel->reltuples != (float4) reltuples)
if (in_place_upd)
/* Bootstrap or reindex case: overwrite fields in place. */
LockBuffer(pg_class_scan->rs_cbuf, BUFFER_LOCK_EXCLUSIVE);
rd_rel->relpages = (int32) relpages;
rd_rel->reltuples = (float4) reltuples;
LockBuffer(pg_class_scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
if (!IsBootstrapProcessingMode())
CacheInvalidateHeapTuple(pg_class, tuple);
/* During normal processing, must work harder. */
rd_rel->relpages = (int32) relpages;
rd_rel->reltuples = (float4) reltuples;
simple_heap_update(pg_class, &tuple->t_self, tuple);
CatalogUpdateIndexes(pg_class, tuple);
if (!pg_class_scan)
* We shouldn't have to do this, but we do... Modify the reldesc in
* place with the new values so that the cache contains the latest
* copy. (XXX is this really still necessary? The relcache will get
* fixed at next CommandCounterIncrement, so why bother here?)
whichRel->rd_rel->relpages = (int32) relpages;
whichRel->rd_rel->reltuples = (float4) reltuples;
heap_close(pg_class, RowExclusiveLock);
relation_close(whichRel, NoLock);
* index_build - invoke access-method-specific index build procedure
/*
 * index_build
 *
 * Invoke the build procedure of the access method of an index.
 *
 * heapRelation   table being indexed
 * indexRelation  index to fill; its rd_am->ambuild names the procedure
 * indexInfo      description of the index
 *
 * NOTE(review): the return type line, braces, the sanity checks, and the
 * call that actually invokes the procedure are missing from this listing;
 * only the fetch of ambuild survives.  Confirm against the upstream file.
 */
index_build(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo)
RegProcedure procedure;
* sanity checks
/* the access method record names the function that builds this index type */
procedure = indexRelation->rd_am->ambuild;
* Call the access method's build procedure
* IndexBuildHeapScan - scan the heap relation to find tuples to be indexed
* This is called back from an access-method-specific index build procedure
* after the AM has done whatever setup it needs. The parent heap relation
* is scanned to find tuples that should be entered into the index. Each
* such tuple is passed to the AM's callback routine, which does the right
* things to add it to the new index. After we return, the AM's index
* build procedure does whatever cleanup is needed; in particular, it should
* close the heap and index relations.
* The total count of heap tuples is returned. This is for updating pg_class
* statistics. (It's annoying not to be able to do that here, but we can't
* do it until after the relation is closed.) Note that the index AM itself
* must keep track of the number of index tuples; we don't do so here because
* the AM might reject some of the tuples for its own reasons, such as being
* unable to store NULLs.
/*
 * IndexBuildHeapScan: scan heapRelation and pass each tuple that should be
 * indexed to the supplied callback.
 *
 * Parameters:
 *   heapRelation   - relation scanned with heap_beginscan / heap_getnext
 *   indexRelation  - index relation, handed to the callback as first argument
 *   indexInfo      - supplies ii_Predicate and ii_Expressions; its
 *                    ii_ExpressionsState and ii_PredicateState fields are
 *                    reset to NIL before returning
 *   callback       - routine called for every tuple that is to be indexed
 *   callback_state - presumably forwarded to the callback; the trailing
 *                    arguments of that call are not visible in this excerpt
 *
 * Returns reltuples, the count incremented once per tuple that reaches the
 * counting statement in the scan loop.
 *
 * NOTE(review): this excerpt shows no braces, else keywords, case labels or
 * break/continue statements, and several call arguments are cut off.  The
 * notes below mark the places where the visible text is incomplete; verify
 * each of them against the complete file.
 */
IndexBuildHeapScan(Relation heapRelation,
Relation indexRelation,
IndexInfo *indexInfo,
IndexBuildCallback callback,
void *callback_state)
HeapScanDesc scan;
HeapTuple heapTuple;
TupleDesc heapDescriptor;
Datum attdata[INDEX_MAX_KEYS];
char nulls[INDEX_MAX_KEYS];
double reltuples;
List *predicate;
TupleTable tupleTable;
TupleTableSlot *slot;
EState *estate;
ExprContext *econtext;
Snapshot snapshot;
TransactionId OldestXmin;
* sanity checks
heapDescriptor = RelationGetDescr(heapRelation);
* Need an EState for evaluation of index expressions and
* partial-index predicates.
estate = CreateExecutorState();
econtext = GetPerTupleExprContext(estate);
* If this is a predicate (partial) index, we will need to evaluate
* the predicate using ExecQual, which requires the current tuple to
* be in a slot of a TupleTable. Likewise if there are any
* expressions.
if (indexInfo->ii_Predicate != NIL || indexInfo->ii_Expressions != NIL)
tupleTable = ExecCreateTupleTable(1);
slot = ExecAllocTableSlot(tupleTable);
ExecSetSlotDescriptor(slot, heapDescriptor, false);
/* Arrange for econtext's scan tuple to be the tuple under test */
econtext->ecxt_scantuple = slot;
/* Set up execution state for predicate. */
predicate = (List *)
ExecPrepareExpr((Expr *) indexInfo->ii_Predicate,
/*
 * NOTE(review): presumably the else branch (neither predicate nor
 * expressions); the else keyword is not visible in this excerpt.  With no
 * slot, the later "if (slot)" test skips storing the tuple.
 */
tupleTable = NULL;
slot = NULL;
predicate = NIL;
* Ok, begin our scan of the base relation. We use SnapshotAny
* because we must retrieve all tuples and do our own time qual
* checks.
if (IsBootstrapProcessingMode())
snapshot = SnapshotNow;
OldestXmin = InvalidTransactionId;
/*
 * NOTE(review): presumably the non-bootstrap branch; the else keyword is
 * not visible in this excerpt.
 */
snapshot = SnapshotAny;
OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared);
scan = heap_beginscan(heapRelation, /* relation */
snapshot, /* seeself */
0, /* number of keys */
NULL); /* scan key */
reltuples = 0;
* Scan all tuples in the base relation.
while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
bool tupleIsAlive;
if (snapshot == SnapshotAny)
/* do our own time qual check */
bool indexIt;
uint16 sv_infomask;
* HeapTupleSatisfiesVacuum may update tuple's hint status
* bits. We could possibly get away with not locking the
* buffer here, since caller should hold ShareLock on the
* relation, but let's be conservative about it.
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
/* Remember the infomask so a hint-bit change can be detected afterwards. */
sv_infomask = heapTuple->t_data->t_infomask;
/*
 * NOTE(review): the case labels and break statements of this switch are
 * not visible in this excerpt; presumably each indexIt / tupleIsAlive pair
 * below belongs to one HeapTupleSatisfiesVacuum result code.
 */
switch (HeapTupleSatisfiesVacuum(heapTuple->t_data, OldestXmin))
indexIt = false;
tupleIsAlive = false;
indexIt = true;
tupleIsAlive = true;
* If tuple is recently deleted then we must index it
* anyway to keep VACUUM from complaining.
indexIt = true;
tupleIsAlive = false;
* Since caller should hold ShareLock or better, we
* should not see any tuples inserted by open
* transactions --- unless it's our own transaction.
* (Consider INSERT followed by CREATE INDEX within a
* transaction.) An exception occurs when reindexing
* a system catalog, because we often release lock on
* system catalogs before committing.
if (!TransactionIdIsCurrentTransactionId(
&& !IsSystemRelation(heapRelation))
elog(ERROR, "concurrent insert in progress");
indexIt = true;
tupleIsAlive = true;
* Since caller should hold ShareLock or better, we
* should not see any tuples deleted by open
* transactions --- unless it's our own transaction.
* (Consider DELETE followed by CREATE INDEX within a
* transaction.) An exception occurs when reindexing
* a system catalog, because we often release lock on
* system catalogs before committing.
if (!TransactionIdIsCurrentTransactionId(
&& !IsSystemRelation(heapRelation))
elog(ERROR, "concurrent delete in progress");
indexIt = true;
tupleIsAlive = false;
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
indexIt = tupleIsAlive = false; /* keep compiler quiet */
/* check for hint-bit update by HeapTupleSatisfiesVacuum */
/*
 * NOTE(review): the statement executed when the infomask changed is not
 * visible in this excerpt.
 */
if (sv_infomask != heapTuple->t_data->t_infomask)
LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
/*
 * NOTE(review): the action taken for a tuple that must not be indexed is
 * not visible here (presumably the loop moves on to the next tuple).
 */
if (!indexIt)
/* heap_getnext did the time qual check */
tupleIsAlive = true;
/* Count this tuple toward the total that is returned to the caller. */
reltuples += 1;
/* Set up for predicate or expression evaluation */
if (slot)
ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);
* In a partial index, discard tuples that don't satisfy the
* predicate. We can also discard recently-dead tuples, since
* VACUUM doesn't complain about tuple count mismatch for partial
* indexes.
/*
 * NOTE(review): the statements that follow the two tests below are not
 * visible in this excerpt.
 */
if (predicate != NIL)
if (!tupleIsAlive)
if (!ExecQual(predicate, econtext, false))
* For the current heap tuple, extract all the attributes we use
* in this index, and note which are null. This also performs
* evaluation of any expressions needed.
* You'd think we should go ahead and build the index tuple here,
* but some index AMs want to do further processing on the data
* first. So pass the attdata and nulls arrays, instead.
/* Call the AM's callback routine to process the tuple */
callback(indexRelation, heapTuple, attdata, nulls, tupleIsAlive,
if (tupleTable)
ExecDropTupleTable(tupleTable, true);
/* These may have been pointing to the now-gone estate */
indexInfo->ii_ExpressionsState = NIL;
indexInfo->ii_PredicateState = NIL;
return reltuples;
/*
 * IndexGetRelation: given an index's relation OID, get the OID of the
 * relation it is an index on.  Uses the system cache.
 *
 * indexId is the OID of the index relation.  Returns the indrelid field
 * of the matching pg_index row.  Raises an ERROR through elog when no
 * pg_index row exists for indexId.
 */
static Oid
IndexGetRelation(Oid indexId)
{
	HeapTuple	tuple;
	Form_pg_index index;
	Oid			result;

	/* The index OID is the lookup key for the INDEXRELID cache. */
	tuple = SearchSysCache(INDEXRELID,
						   ObjectIdGetDatum(indexId),
						   0, 0, 0);
	if (!HeapTupleIsValid(tuple))
		elog(ERROR, "cache lookup failed for index %u", indexId);
	index = (Form_pg_index) GETSTRUCT(tuple);
	Assert(index->indexrelid == indexId);

	/*
	 * Copy the answer out before releasing the tuple: "index" points into
	 * the cache entry and must not be dereferenced after the release.
	 */
	result = index->indrelid;
	ReleaseSysCache(tuple);
	return result;
}
* reindex_index - This routine is used to recreate a single index
/*
 * reindex_index: rebuild the index identified by indexId.
 *
 * Visible steps: open the index and lock it with AccessExclusiveLock, read
 * the parent table OID from rd_index->indrelid, open the parent with
 * AccessExclusiveLock, call SetReindexProcessing(heapId, indexId), build an
 * IndexInfo with BuildIndexInfo, and call index_build.  When the index is
 * shared (rd_rel->relisshared) the work is done in place: the storage is
 * truncated to zero blocks with smgrtruncate and rd_nblocks / rd_targblock
 * are reset.  SetReindexProcessing(InvalidOid, InvalidOid) is called last.
 *
 * NOTE(review): this excerpt shows no return type, braces or else keyword;
 * the second declarator after "Relation iRel," (heapRelation is used below
 * but its declaration is not visible), the head of the error report for
 * shared indexes, the buffer-release call and the body of the non-inplace
 * branch are also not visible.  Verify against the complete file.
 */
reindex_index(Oid indexId)
Relation iRel,
IndexInfo *indexInfo;
Oid heapId;
bool inplace;
* Open our index relation and get an exclusive lock on it.
* Note: for REINDEX INDEX, doing this before opening the parent heap
* relation means there's a possibility for deadlock failure against
* another xact that is doing normal accesses to the heap and index.
* However, it's not real clear why you'd be wanting to do REINDEX INDEX
* on a table that's in active use, so I'd rather have the protection of
* making sure the index is locked down. In the REINDEX TABLE and
* REINDEX DATABASE cases, there is no problem because caller already
* holds exclusive lock on the parent table.
iRel = index_open(indexId);
LockRelation(iRel, AccessExclusiveLock);
/* Get OID of index's parent table */
heapId = iRel->rd_index->indrelid;
/* Open and lock the parent heap relation */
heapRelation = heap_open(heapId, AccessExclusiveLock);
SetReindexProcessing(heapId, indexId);
* If it's a shared index, we must do inplace processing (because we
* have no way to update relfilenode in other databases). Otherwise
* we can do it the normal transaction-safe way.
* Since inplace processing isn't crash-safe, we only allow it in a
* standalone backend. (In the REINDEX TABLE and REINDEX DATABASE cases,
* the caller should have detected this.)
/* In-place processing is chosen exactly when the index is shared. */
inplace = iRel->rd_rel->relisshared;
if (inplace && IsUnderPostmaster)
errmsg("shared index \"%s\" can only be reindexed in stand-alone mode",
/* Fetch info needed for index_build */
indexInfo = BuildIndexInfo(iRel);
if (inplace)
* Release any buffers associated with this index. If they're
* dirty, they're just dropped without bothering to flush to disk.
/* Now truncate the actual data and set blocks to zero */
if (iRel->rd_smgr == NULL)
iRel->rd_smgr = smgropen(iRel->rd_node);
smgrtruncate(iRel->rd_smgr, 0);
iRel->rd_nblocks = 0;
iRel->rd_targblock = InvalidBlockNumber;
/*
 * NOTE(review): presumably the else branch (index is not shared); neither
 * the else keyword nor the statement that creates the new physical
 * relation is visible in this excerpt.
 */
* We'll build a new physical relation for the index.
/* Initialize the index and rebuild */
index_build(heapRelation, iRel, indexInfo);
* index_build will close both the heap and index relations (but not
* give up the locks we hold on them). So we're done.
/* Clear the reindex-processing marker that was set before the rebuild. */
SetReindexProcessing(InvalidOid, InvalidOid);
* reindex_relation - This routine is used to recreate all indexes
* of a relation (and its toast relation too, if any).
* Returns true if any indexes were rebuilt.
/*
 * reindex_relation: walk the index list of the relation identified by
 * relid, then recurse into its toast relation when reltoastrelid is a
 * valid OID.
 *
 * The relation is opened with AccessExclusiveLock and closed with NoLock,
 * so the lock is not released by the close.
 *
 * Returns true when the index list of the relation was non-empty, or when
 * the recursive call for the toast relation returned true.
 *
 * NOTE(review): this excerpt shows no return type or braces, the
 * declaration list is cut off after "List *indexIds," (doneIndexes and
 * indexId are used below but not visibly declared), and the loop body
 * shows no call that rebuilds an individual index.  Verify against the
 * complete file.
 */
reindex_relation(Oid relid)
Relation rel;
Oid toast_relid;
bool is_pg_class;
bool result;
List *indexIds,
* Ensure to hold an exclusive lock throughout the transaction. The
* lock could perhaps be less intensive (in the non-overwrite case)
* but for now it's AccessExclusiveLock for simplicity.
rel = heap_open(relid, AccessExclusiveLock);
toast_relid = rel->rd_rel->reltoastrelid;
* Get the list of index OIDs for this relation. (We trust to the
* relcache to get this with a sequential scan if ignoring system
* indexes.)
indexIds = RelationGetIndexList(rel);
* reindex_index will attempt to update the pg_class rows for the
* relation and index. If we are processing pg_class itself, we
* want to make sure that the updates do not try to insert index
* entries into indexes we have not processed yet. (When we are
* trying to recover from corrupted indexes, that could easily
* cause a crash.) We can accomplish this because CatalogUpdateIndexes
* will use the relcache's index list to know which indexes to update.
* We just force the index list to be only the stuff we've processed.
* It is okay to not insert entries into the indexes we have not
* processed yet because all of this is transaction-safe. If we fail
* partway through, the updated rows are dead and it doesn't matter
* whether they have index entries. Also, a new pg_class index will
* be created with an entry for its own pg_class row because we do
* setNewRelfilenode() before we do index_build().
/* The special index-list handling below applies only to pg_class itself. */
is_pg_class = (RelationGetRelid(rel) == RelOid_pg_class);
doneIndexes = NIL;
/* Reindex all the indexes. */
foreach(indexId, indexIds)
Oid indexOid = lfirsto(indexId);
/* For pg_class, restrict the index list to the indexes already processed. */
if (is_pg_class)
RelationSetIndexList(rel, doneIndexes);
/*
 * NOTE(review): the per-index rebuild call is not visible between the
 * statement above and the bookkeeping below.
 */
if (is_pg_class)
doneIndexes = lappendo(doneIndexes, indexOid);
/* After the loop, put back the complete index list for pg_class. */
if (is_pg_class)
RelationSetIndexList(rel, indexIds);
* Close rel, but continue to hold the lock.
heap_close(rel, NoLock);
/* True when the relation had at least one index in its list. */
result = (indexIds != NIL);
* If the relation has a secondary toast rel, reindex that too while we
* still hold the lock on the master table.
if (toast_relid != InvalidOid)
result |= reindex_relation(toast_relid);
return result;