postgresql/src/backend/utils/cache/inval.c

1326 lines
41 KiB
C
Raw Normal View History

/*-------------------------------------------------------------------------
*
* inval.c
* POSTGRES cache invalidation dispatcher code.
*
* This is subtle stuff, so pay attention:
*
* When a tuple is updated or deleted, our standard time qualification rules
* consider that it is *still valid* so long as we are in the same command,
* ie, until the next CommandCounterIncrement() or transaction commit.
* (See utils/time/tqual.c, and note that system catalogs are generally
* scanned under SnapshotNow rules by the system, or plain user snapshots
2002-09-04 22:31:48 +02:00
* for user queries.) At the command boundary, the old tuple stops
* being valid and the new version, if any, becomes valid. Therefore,
* we cannot simply flush a tuple from the system caches during heap_update()
* or heap_delete(). The tuple is still good at that point; what's more,
* even if we did flush it, it might be reloaded into the caches by a later
* request in the same command. So the correct behavior is to keep a list
* of outdated (updated/deleted) tuples and then do the required cache
* flushes at the next command boundary. We must also keep track of
* inserted tuples so that we can flush "negative" cache entries that match
* the new tuples; again, that mustn't happen until end of command.
*
* Once we have finished the command, we still need to remember inserted
* tuples (including new versions of updated tuples), so that we can flush
* them from the caches if we abort the transaction. Similarly, we'd better
* be able to flush "negative" cache entries that may have been loaded in
* place of deleted tuples, so we still need the deleted ones too.
*
* If we successfully complete the transaction, we have to broadcast all
* these invalidation events to other backends (via the SI message queue)
2002-09-04 22:31:48 +02:00
* so that they can flush obsolete entries from their caches. Note we have
* to record the transaction commit before sending SI messages, otherwise
* the other backends won't see our updated tuples as good.
*
* When a subtransaction aborts, we can process and discard any events
2004-08-29 07:07:03 +02:00
* it has queued. When a subtransaction commits, we just add its events
* to the pending lists of the parent transaction.
*
* In short, we need to remember until xact end every insert or delete
2002-09-04 22:31:48 +02:00
* of a tuple that might be in the system caches. Updates are treated as
* two events, delete + insert, for simplicity. (There are cases where
* it'd be possible to record just one event, but we don't currently try.)
*
* We do not need to register EVERY tuple operation in this way, just those
* on tuples in relations that have associated catcaches. We do, however,
* have to register every operation on every tuple that *could* be in a
* catcache, whether or not it currently is in our cache. Also, if the
* tuple is in a relation that has multiple catcaches, we need to register
* an invalidation message for each such catcache. catcache.c's
* PrepareToInvalidateCacheTuple() routine provides the knowledge of which
* catcaches may need invalidation for a given tuple.
*
* Also, whenever we see an operation on a pg_class or pg_attribute tuple,
* we register a relcache flush operation for the relation described by that
* tuple.
*
* We keep the relcache flush requests in lists separate from the catcache
* tuple flush requests. This allows us to issue all the pending catcache
* flushes before we issue relcache flushes, which saves us from loading
* a catcache tuple during relcache load only to flush it again right away.
* Also, we avoid queuing multiple relcache flush requests for the same
* relation, since a relcache flush is relatively expensive to do.
* (XXX is it worth testing likewise for duplicate catcache flush entries?
* Probably not.)
*
* If a relcache flush is issued for a system relation that we preload
* from the relcache init file, we must also delete the init file so that
* it will be rebuilt during the next backend restart. The actual work of
* manipulating the init file is in relcache.c, but we keep track of the
* need for it here.
*
* The request lists proper are kept in CurTransactionContext of their
* creating (sub)transaction, since they can be forgotten on abort of that
* transaction but must be kept till top-level commit otherwise. For
* simplicity we keep the controlling list-of-lists in TopTransactionContext.
*
*
2010-01-02 17:58:17 +01:00
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.93 2010/02/03 01:14:17 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/twophase_rmgr.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/pg_tablespace.h"
1999-07-16 07:00:38 +02:00
#include "miscadmin.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "utils/inval.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/syscache.h"
/*
* To minimize palloc traffic, we keep pending requests in successively-
* larger chunks (a slightly more sophisticated version of an expansible
* array). All request types can be stored as SharedInvalidationMessage
* records. The ordering of requests within a list is never significant.
*/
typedef struct InvalidationChunk
{
struct InvalidationChunk *next; /* list link */
int nitems; /* # items currently stored in chunk */
int maxitems; /* size of allocated array in this chunk */
SharedInvalidationMessage msgs[1]; /* VARIABLE LENGTH ARRAY */
} InvalidationChunk; /* VARIABLE LENGTH STRUCTURE */
typedef struct InvalidationListHeader
{
InvalidationChunk *cclist; /* list of chunks holding catcache msgs */
InvalidationChunk *rclist; /* list of chunks holding relcache msgs */
} InvalidationListHeader;
2002-04-30 00:14:34 +02:00
/*----------------
* Invalidation info is divided into two lists:
* 1) events so far in current command, not yet reflected to caches.
* 2) events in previous commands of current transaction; these have
* been reflected to local caches, and must be either broadcast to
* other backends or rolled back from local cache when we commit
* or abort the transaction.
* Actually, we need two such lists for each level of nested transaction,
* so that we can discard events from an aborted subtransaction. When
* a subtransaction commits, we append its lists to the parent's lists.
*
* The relcache-file-invalidated flag can just be a simple boolean,
* since we only act on it at transaction commit; we don't care which
* command of the transaction set it.
2002-04-30 00:14:34 +02:00
*----------------
*/
typedef struct TransInvalidationInfo
{
/* Back link to parent transaction's info */
struct TransInvalidationInfo *parent;
/* Subtransaction nesting depth */
2005-10-15 04:49:52 +02:00
int my_level;
/* head of current-command event list */
InvalidationListHeader CurrentCmdInvalidMsgs;
/* head of previous-commands event list */
InvalidationListHeader PriorCmdInvalidMsgs;
/* init file must be invalidated? */
2004-08-29 07:07:03 +02:00
bool RelcacheInitFileInval;
} TransInvalidationInfo;
static TransInvalidationInfo *transInvalInfo = NULL;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
static SharedInvalidationMessage *SharedInvalidMessagesArray;
static int numSharedInvalidMessagesArray;
static int maxSharedInvalidMessagesArray;
2002-04-30 00:14:34 +02:00
/*
* Dynamically-registered callback functions. Current implementation
* assumes there won't be very many of these at once; could improve if needed.
*/
#define MAX_SYSCACHE_CALLBACKS 20
#define MAX_RELCACHE_CALLBACKS 5
2002-04-30 00:14:34 +02:00
static struct SYSCACHECALLBACK
2002-04-30 00:14:34 +02:00
{
int16 id; /* cache number */
SyscacheCallbackFunction function;
2002-04-30 00:14:34 +02:00
Datum arg;
} syscache_callback_list[MAX_SYSCACHE_CALLBACKS];
2002-04-30 00:14:34 +02:00
static int syscache_callback_count = 0;
static struct RELCACHECALLBACK
{
RelcacheCallbackFunction function;
Datum arg;
} relcache_callback_list[MAX_RELCACHE_CALLBACKS];
static int relcache_callback_count = 0;
2002-04-30 00:14:34 +02:00
/* ----------------------------------------------------------------
* Invalidation list support functions
*
* These three routines encapsulate processing of the "chunked"
* representation of what is logically just a list of messages.
* ----------------------------------------------------------------
*/
/*
* AddInvalidationMessage
* Add an invalidation message to a list (of chunks).
*
* Note that we do not pay any great attention to maintaining the original
* ordering of the messages.
*/
static void
AddInvalidationMessage(InvalidationChunk **listHdr,
SharedInvalidationMessage *msg)
{
InvalidationChunk *chunk = *listHdr;
if (chunk == NULL)
{
/* First time through; create initial chunk */
#define FIRSTCHUNKSIZE 32
chunk = (InvalidationChunk *)
MemoryContextAlloc(CurTransactionContext,
sizeof(InvalidationChunk) +
2005-10-15 04:49:52 +02:00
(FIRSTCHUNKSIZE - 1) *sizeof(SharedInvalidationMessage));
chunk->nitems = 0;
chunk->maxitems = FIRSTCHUNKSIZE;
chunk->next = *listHdr;
*listHdr = chunk;
}
else if (chunk->nitems >= chunk->maxitems)
{
/* Need another chunk; double size of last chunk */
int chunksize = 2 * chunk->maxitems;
chunk = (InvalidationChunk *)
MemoryContextAlloc(CurTransactionContext,
sizeof(InvalidationChunk) +
2005-10-15 04:49:52 +02:00
(chunksize - 1) *sizeof(SharedInvalidationMessage));
chunk->nitems = 0;
chunk->maxitems = chunksize;
chunk->next = *listHdr;
*listHdr = chunk;
}
/* Okay, add message to current chunk */
chunk->msgs[chunk->nitems] = *msg;
chunk->nitems++;
}
/*
* Append one list of invalidation message chunks to another, resetting
* the source chunk-list pointer to NULL.
*/
static void
AppendInvalidationMessageList(InvalidationChunk **destHdr,
InvalidationChunk **srcHdr)
{
InvalidationChunk *chunk = *srcHdr;
if (chunk == NULL)
return; /* nothing to do */
while (chunk->next != NULL)
chunk = chunk->next;
chunk->next = *destHdr;
*destHdr = *srcHdr;
*srcHdr = NULL;
}
/*
* Process a list of invalidation messages.
*
* This is a macro that executes the given code fragment for each message in
* a message chunk list. The fragment should refer to the message as *msg.
*/
#define ProcessMessageList(listHdr, codeFragment) \
do { \
InvalidationChunk *_chunk; \
for (_chunk = (listHdr); _chunk != NULL; _chunk = _chunk->next) \
{ \
int _cindex; \
for (_cindex = 0; _cindex < _chunk->nitems; _cindex++) \
{ \
SharedInvalidationMessage *msg = &_chunk->msgs[_cindex]; \
codeFragment; \
} \
} \
} while (0)
/*
* Process a list of invalidation messages group-wise.
*
* As above, but the code fragment can handle an array of messages.
* The fragment should refer to the messages as msgs[], with n entries.
*/
#define ProcessMessageListMulti(listHdr, codeFragment) \
do { \
InvalidationChunk *_chunk; \
for (_chunk = (listHdr); _chunk != NULL; _chunk = _chunk->next) \
{ \
SharedInvalidationMessage *msgs = _chunk->msgs; \
int n = _chunk->nitems; \
codeFragment; \
} \
} while (0)
/* ----------------------------------------------------------------
* Invalidation set support functions
*
* These routines understand about the division of a logical invalidation
* list into separate physical lists for catcache and relcache entries.
* ----------------------------------------------------------------
*/
/*
* Add a catcache inval entry
*/
static void
AddCatcacheInvalidationMessage(InvalidationListHeader *hdr,
int id, uint32 hashValue,
ItemPointer tuplePtr, Oid dbId)
{
SharedInvalidationMessage msg;
msg.cc.id = (int16) id;
msg.cc.tuplePtr = *tuplePtr;
msg.cc.dbId = dbId;
msg.cc.hashValue = hashValue;
AddInvalidationMessage(&hdr->cclist, &msg);
}
/*
* Add a relcache inval entry
*/
static void
AddRelcacheInvalidationMessage(InvalidationListHeader *hdr,
Oid dbId, Oid relId)
{
SharedInvalidationMessage msg;
/* Don't add a duplicate item */
/* We assume dbId need not be checked because it will never change */
ProcessMessageList(hdr->rclist,
if (msg->rc.id == SHAREDINVALRELCACHE_ID &&
msg->rc.relId == relId)
2005-10-15 04:49:52 +02:00
return);
/* OK, add the item */
msg.rc.id = SHAREDINVALRELCACHE_ID;
msg.rc.dbId = dbId;
msg.rc.relId = relId;
AddInvalidationMessage(&hdr->rclist, &msg);
}
/*
* Append one list of invalidation messages to another, resetting
* the source list to empty.
*/
static void
AppendInvalidationMessages(InvalidationListHeader *dest,
InvalidationListHeader *src)
{
AppendInvalidationMessageList(&dest->cclist, &src->cclist);
AppendInvalidationMessageList(&dest->rclist, &src->rclist);
}
/*
* Execute the given function for all the messages in an invalidation list.
* The list is not altered.
*
* catcache entries are processed first, for reasons mentioned above.
*/
static void
ProcessInvalidationMessages(InvalidationListHeader *hdr,
void (*func) (SharedInvalidationMessage *msg))
{
ProcessMessageList(hdr->cclist, func(msg));
ProcessMessageList(hdr->rclist, func(msg));
}
/*
* As above, but the function is able to process an array of messages
* rather than just one at a time.
*/
static void
ProcessInvalidationMessagesMulti(InvalidationListHeader *hdr,
void (*func) (const SharedInvalidationMessage *msgs, int n))
{
ProcessMessageListMulti(hdr->cclist, func(msgs, n));
ProcessMessageListMulti(hdr->rclist, func(msgs, n));
}
/* ----------------------------------------------------------------
* private support functions
* ----------------------------------------------------------------
*/
/*
* RegisterCatcacheInvalidation
*
* Register an invalidation event for a catcache tuple entry.
*/
static void
RegisterCatcacheInvalidation(int cacheId,
uint32 hashValue,
ItemPointer tuplePtr,
Oid dbId)
{
AddCatcacheInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs,
cacheId, hashValue, tuplePtr, dbId);
}
/*
* RegisterRelcacheInvalidation
*
* As above, but register a relcache invalidation event.
*/
static void
RegisterRelcacheInvalidation(Oid dbId, Oid relId)
{
AddRelcacheInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs,
dbId, relId);
2002-09-04 22:31:48 +02:00
/*
* Most of the time, relcache invalidation is associated with system
* catalog updates, but there are a few cases where it isn't. Quick hack
* to ensure that the next CommandCounterIncrement() will think that we
* need to do CommandEndInvalidationMessages().
*/
(void) GetCurrentCommandId(true);
/*
* If the relation being invalidated is one of those cached in the
* relcache init file, mark that we need to zap that file at commit.
*/
if (RelationIdIsInInitFile(relId))
transInvalInfo->RelcacheInitFileInval = true;
}
/*
* LocalExecuteInvalidationMessage
*
* Process a single invalidation message (which could be of any type).
* Only the local caches are flushed; this does not transmit the message
* to other backends.
*/
static void
LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg)
{
2002-04-30 00:14:34 +02:00
int i;
if (msg->id >= 0)
{
if (msg->cc.dbId == MyDatabaseId || msg->cc.dbId == 0)
2002-04-30 00:14:34 +02:00
{
CatalogCacheIdInvalidate(msg->cc.id,
msg->cc.hashValue,
&msg->cc.tuplePtr);
2002-04-30 00:14:34 +02:00
for (i = 0; i < syscache_callback_count; i++)
2002-04-30 00:14:34 +02:00
{
struct SYSCACHECALLBACK *ccitem = syscache_callback_list + i;
2002-04-30 00:14:34 +02:00
if (ccitem->id == msg->cc.id)
(*ccitem->function) (ccitem->arg,
msg->cc.id, &msg->cc.tuplePtr);
2002-04-30 00:14:34 +02:00
}
}
}
else if (msg->id == SHAREDINVALRELCACHE_ID)
{
if (msg->rc.dbId == MyDatabaseId || msg->rc.dbId == InvalidOid)
2002-04-30 00:14:34 +02:00
{
RelationCacheInvalidateEntry(msg->rc.relId);
2002-04-30 00:14:34 +02:00
for (i = 0; i < relcache_callback_count; i++)
2002-04-30 00:14:34 +02:00
{
struct RELCACHECALLBACK *ccitem = relcache_callback_list + i;
2002-04-30 00:14:34 +02:00
(*ccitem->function) (ccitem->arg, msg->rc.relId);
2002-04-30 00:14:34 +02:00
}
}
}
else if (msg->id == SHAREDINVALSMGR_ID)
{
/*
2005-10-15 04:49:52 +02:00
* We could have smgr entries for relations of other databases, so no
* short-circuit test is possible here.
*/
smgrclosenode(msg->sm.rnode);
}
else
elog(FATAL, "unrecognized SI message id: %d", msg->id);
}
/*
* InvalidateSystemCaches
*
* This blows away all tuples in the system catalog caches and
* all the cached relation descriptors and smgr cache entries.
* Relation descriptors that have positive refcounts are then rebuilt.
*
* We call this when we see a shared-inval-queue overflow signal,
* since that tells us we've lost some shared-inval messages and hence
* don't know what needs to be invalidated.
*/
static void
InvalidateSystemCaches(void)
{
2002-04-30 00:14:34 +02:00
int i;
ResetCatalogCaches();
RelationCacheInvalidate(); /* gets smgr cache too */
2002-04-30 00:14:34 +02:00
for (i = 0; i < syscache_callback_count; i++)
2002-04-30 00:14:34 +02:00
{
struct SYSCACHECALLBACK *ccitem = syscache_callback_list + i;
(*ccitem->function) (ccitem->arg, ccitem->id, NULL);
}
for (i = 0; i < relcache_callback_count; i++)
{
struct RELCACHECALLBACK *ccitem = relcache_callback_list + i;
2002-04-30 00:14:34 +02:00
(*ccitem->function) (ccitem->arg, InvalidOid);
}
}
/*
* PrepareForTupleInvalidation
* Detect whether invalidation of this tuple implies invalidation
* of catalog/relation cache entries; if so, register inval events.
*/
static void
PrepareForTupleInvalidation(Relation relation, HeapTuple tuple)
{
Oid tupleRelId;
Oid databaseId;
Oid relationId;
/* Do nothing during bootstrap */
if (IsBootstrapProcessingMode())
return;
/*
2005-10-15 04:49:52 +02:00
* We only need to worry about invalidation for tuples that are in system
* relations; user-relation tuples are never in catcaches and can't affect
* the relcache either.
*/
if (!IsSystemRelation(relation))
return;
2002-09-04 22:31:48 +02:00
/*
2005-10-15 04:49:52 +02:00
* TOAST tuples can likewise be ignored here. Note that TOAST tables are
* considered system relations so they are not filtered by the above test.
*/
if (IsToastRelation(relation))
return;
/*
* First let the catcache do its thing
*/
PrepareToInvalidateCacheTuple(relation, tuple,
RegisterCatcacheInvalidation);
/*
* Now, is this tuple one of the primary definers of a relcache entry?
*/
tupleRelId = RelationGetRelid(relation);
if (tupleRelId == RelationRelationId)
{
Form_pg_class classtup = (Form_pg_class) GETSTRUCT(tuple);
relationId = HeapTupleGetOid(tuple);
if (classtup->relisshared)
databaseId = InvalidOid;
else
databaseId = MyDatabaseId;
}
else if (tupleRelId == AttributeRelationId)
{
Form_pg_attribute atttup = (Form_pg_attribute) GETSTRUCT(tuple);
relationId = atttup->attrelid;
2004-08-29 07:07:03 +02:00
/*
2005-10-15 04:49:52 +02:00
* KLUGE ALERT: we always send the relcache event with MyDatabaseId,
* even if the rel in question is shared (which we can't easily tell).
* This essentially means that only backends in this same database
* will react to the relcache flush request. This is in fact
2005-10-15 04:49:52 +02:00
* appropriate, since only those backends could see our pg_attribute
2007-11-15 22:14:46 +01:00
* change anyway. It looks a bit ugly though. (In practice, shared
* relations can't have schema changes after bootstrap, so we should
* never come here for a shared rel anyway.)
*/
databaseId = MyDatabaseId;
}
else if (tupleRelId == IndexRelationId)
{
Form_pg_index indextup = (Form_pg_index) GETSTRUCT(tuple);
/*
* When a pg_index row is updated, we should send out a relcache inval
2007-11-15 22:14:46 +01:00
* for the index relation. As above, we don't know the shared status
* of the index, but in practice it doesn't matter since indexes of
* shared catalogs can't have such updates.
*/
relationId = indextup->indexrelid;
databaseId = MyDatabaseId;
}
else
return;
/*
* Yes. We need to register a relcache invalidation event.
*/
RegisterRelcacheInvalidation(databaseId, relationId);
}
/* ----------------------------------------------------------------
* public functions
* ----------------------------------------------------------------
*/
/*
* AcceptInvalidationMessages
* Read and process invalidation messages from the shared invalidation
* message queue.
*
* Note:
* This should be called as the first step in processing a transaction.
*/
void
AcceptInvalidationMessages(void)
{
ReceiveSharedInvalidMessages(LocalExecuteInvalidationMessage,
InvalidateSystemCaches);
/*
* Test code to force cache flushes anytime a flush could happen.
*
* If used with CLOBBER_FREED_MEMORY, CLOBBER_CACHE_ALWAYS provides a
* fairly thorough test that the system contains no cache-flush hazards.
* However, it also makes the system unbelievably slow --- the regression
* tests take about 100 times longer than normal.
*
2006-10-04 02:30:14 +02:00
* If you're a glutton for punishment, try CLOBBER_CACHE_RECURSIVELY. This
* slows things by at least a factor of 10000, so I wouldn't suggest
* trying to run the entire regression tests that way. It's useful to try
* a few simple tests, to make sure that cache reload isn't subject to
* internal cache-flush hazards, but after you've done a few thousand
* recursive reloads it's unlikely you'll learn more.
*/
#if defined(CLOBBER_CACHE_ALWAYS)
{
static bool in_recursion = false;
if (!in_recursion)
{
in_recursion = true;
InvalidateSystemCaches();
in_recursion = false;
}
}
#elif defined(CLOBBER_CACHE_RECURSIVELY)
InvalidateSystemCaches();
#endif
}
/*
* AtStart_Inval
* Initialize inval lists at start of a main transaction.
*/
void
AtStart_Inval(void)
{
Assert(transInvalInfo == NULL);
transInvalInfo = (TransInvalidationInfo *)
MemoryContextAllocZero(TopTransactionContext,
sizeof(TransInvalidationInfo));
transInvalInfo->my_level = GetCurrentTransactionNestLevel();
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
SharedInvalidMessagesArray = NULL;
numSharedInvalidMessagesArray = 0;
}
/*
* PostPrepare_Inval
2005-10-15 04:49:52 +02:00
* Clean up after successful PREPARE.
*
* Here, we want to act as though the transaction aborted, so that we will
* undo any syscache changes it made, thereby bringing us into sync with the
* outside world, which doesn't believe the transaction committed yet.
*
* If the prepared transaction is later aborted, there is nothing more to
* do; if it commits, we will receive the consequent inval messages just
* like everyone else.
*/
void
PostPrepare_Inval(void)
{
AtEOXact_Inval(false);
}
/*
* AtSubStart_Inval
* Initialize inval lists at start of a subtransaction.
*/
void
AtSubStart_Inval(void)
{
TransInvalidationInfo *myInfo;
Assert(transInvalInfo != NULL);
myInfo = (TransInvalidationInfo *)
MemoryContextAllocZero(TopTransactionContext,
sizeof(TransInvalidationInfo));
myInfo->parent = transInvalInfo;
myInfo->my_level = GetCurrentTransactionNestLevel();
transInvalInfo = myInfo;
}
/*
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
* Collect invalidation messages into SharedInvalidMessagesArray array.
*/
static void
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
MakeSharedInvalidMessagesArray(const SharedInvalidationMessage *msgs, int n)
{
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* Initialise array first time through in each commit
*/
if (SharedInvalidMessagesArray == NULL)
{
maxSharedInvalidMessagesArray = FIRSTCHUNKSIZE;
numSharedInvalidMessagesArray = 0;
/*
* Although this is being palloc'd we don't actually free it directly.
* We're so close to EOXact that we now we're going to lose it anyhow.
*/
SharedInvalidMessagesArray = palloc(maxSharedInvalidMessagesArray
* sizeof(SharedInvalidationMessage));
}
if ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
{
while ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
maxSharedInvalidMessagesArray *= 2;
SharedInvalidMessagesArray = repalloc(SharedInvalidMessagesArray,
maxSharedInvalidMessagesArray
* sizeof(SharedInvalidationMessage));
}
/*
* Append the next chunk onto the array
*/
memcpy(SharedInvalidMessagesArray + numSharedInvalidMessagesArray,
msgs, n * sizeof(SharedInvalidationMessage));
numSharedInvalidMessagesArray += n;
}
/*
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
* xactGetCommittedInvalidationMessages() is executed by
* RecordTransactionCommit() to add invalidation messages onto the
* commit record. This applies only to commit message types, never to
* abort records. Must always run before AtEOXact_Inval(), since that
* removes the data we need to see.
*
* Remember that this runs before we have officially committed, so we
* must not do anything here to change what might occur *if* we should
* fail between here and the actual commit.
*
* see also xact_redo_commit() and xact_desc_commit()
*/
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
int
xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
bool *RelcacheInitFileInval)
{
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
MemoryContext oldcontext;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/* Must be at top of stack */
Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
/*
* Relcache init file invalidation requires processing both before and
* after we send the SI messages. However, we need not do anything
* unless we committed.
*/
*RelcacheInitFileInval = transInvalInfo->RelcacheInitFileInval;
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
/*
* Walk through TransInvalidationInfo to collect all the messages
* into a single contiguous array of invalidation messages. It must
* be contiguous so we can copy directly into WAL message. Maintain the
* order that they would be processed in by AtEOXact_Inval(), to ensure
* emulated behaviour in redo is as similar as possible to original.
* We want the same bugs, if any, not new ones.
*/
oldcontext = MemoryContextSwitchTo(CurTransactionContext);
ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
MakeSharedInvalidMessagesArray);
ProcessInvalidationMessagesMulti(&transInvalInfo->PriorCmdInvalidMsgs,
MakeSharedInvalidMessagesArray);
MemoryContextSwitchTo(oldcontext);
Assert(!(numSharedInvalidMessagesArray > 0 &&
SharedInvalidMessagesArray == NULL));
*msgs = SharedInvalidMessagesArray;
return numSharedInvalidMessagesArray;
}
#define RecoveryRelationCacheInitFileInvalidate(dbo, tbo, tf) \
{ \
DatabasePath = GetDatabasePath(dbo, tbo); \
elog(trace_recovery(DEBUG4), "removing relcache init file in %s", DatabasePath); \
RelationCacheInitFileInvalidate(tf); \
pfree(DatabasePath); \
}
/*
* ProcessCommittedInvalidationMessages is executed by xact_redo_commit()
* to process invalidation messages added to commit records.
*
* If we have to invalidate the relcache init file we need to extract
* the database id from each message so we can correctly locate the database
* path and so remove that database's init file. We note that the relcache
* only contains entries for catalog tables from a single database, or
* shared relations. There are smgr invalidations that reference other
* databases but they never cause relcache file invalidations.
* So we only need to access either global or default tablespaces and
* never have need to scan pg_database to discover tablespace oids.
*
* Relcache init file invalidation requires processing both
* before and after we send the SI messages. See AtEOXact_Inval()
*
* We deliberately avoid SetDatabasePath() since it is intended to be used
* only once by normal backends, so we set DatabasePath directly then
* pfree after use. See RecoveryRelationCacheInitFileInvalidate() macro.
*/
void
ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
int nmsgs, bool RelcacheInitFileInval)
{
Oid dboid = 0;
bool invalidate_global = false;
if (nmsgs > 0)
elog(trace_recovery(DEBUG4), "replaying commit with %d messages%s", nmsgs,
(RelcacheInitFileInval ? " and relcache file invalidation" : ""));
else
return;
if (RelcacheInitFileInval)
{
int i;
/*
* Check messages to record dboid
*/
for (i = 0; i < nmsgs; i++)
{
SharedInvalidationMessage *inval_msg = &(msgs[i]);
Oid loop_dboid = 0;
/*
* Extract the database Oid from the message
*/
if (inval_msg->id >= 0)
loop_dboid = inval_msg->cc.dbId;
else if (inval_msg->id == SHAREDINVALRELCACHE_ID)
loop_dboid = inval_msg->rc.dbId;
else
{
/*
* Invalidation message is a SHAREDINVALSMGR_ID
* which never cause relcache file invalidation,
* so we ignore them, no matter which db they're for.
*/
continue;
}
if (loop_dboid == 0)
invalidate_global = true;
else
{
Assert(dboid == 0 || dboid == loop_dboid);
dboid = loop_dboid;
}
}
/*
* If shared, dboid will be the global tablespace, otherwise it will
* be a local catalog relation in the default tablespace.
*/
if (invalidate_global)
RecoveryRelationCacheInitFileInvalidate(0, GLOBALTABLESPACE_OID, true);
if (dboid != 0)
RecoveryRelationCacheInitFileInvalidate(dboid, DEFAULTTABLESPACE_OID, true);
}
SendSharedInvalidMessages(msgs, nmsgs);
if (RelcacheInitFileInval)
{
/*
* Second invalidation, very similar to above. See RelationCacheInitFileInvalidate()
*/
if (invalidate_global)
RecoveryRelationCacheInitFileInvalidate(0, GLOBALTABLESPACE_OID, false);
if (dboid != 0)
RecoveryRelationCacheInitFileInvalidate(dboid, DEFAULTTABLESPACE_OID, false);
}
}
/*
* AtEOXact_Inval
* Process queued-up invalidation messages at end of main transaction.
*
* If isCommit, we must send out the messages in our PriorCmdInvalidMsgs list
* to the shared invalidation message queue. Note that these will be read
* not only by other backends, but also by our own backend at the next
2002-09-04 22:31:48 +02:00
* transaction start (via AcceptInvalidationMessages). This means that
* we can skip immediate local processing of anything that's still in
* CurrentCmdInvalidMsgs, and just send that list out too.
*
* If not isCommit, we are aborting, and must locally process the messages
2002-09-04 22:31:48 +02:00
* in PriorCmdInvalidMsgs. No messages need be sent to other backends,
* since they'll not have seen our changed tuples anyway. We can forget
* about CurrentCmdInvalidMsgs too, since those changes haven't touched
* the caches yet.
*
* In any case, reset the various lists to empty. We need not physically
* free memory here, since TopTransactionContext is about to be emptied
* anyway.
*
* Note:
* This should be called as the last step in processing a transaction.
*/
void
AtEOXact_Inval(bool isCommit)
{
if (isCommit)
{
/* Must be at top of stack */
Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
/*
2005-10-15 04:49:52 +02:00
* Relcache init file invalidation requires processing both before and
* after we send the SI messages. However, we need not do anything
* unless we committed.
*/
if (transInvalInfo->RelcacheInitFileInval)
RelationCacheInitFileInvalidate(true);
AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
2005-10-15 04:49:52 +02:00
&transInvalInfo->CurrentCmdInvalidMsgs);
ProcessInvalidationMessagesMulti(&transInvalInfo->PriorCmdInvalidMsgs,
SendSharedInvalidMessages);
if (transInvalInfo->RelcacheInitFileInval)
RelationCacheInitFileInvalidate(false);
}
else if (transInvalInfo != NULL)
{
/* Must be at top of stack */
Assert(transInvalInfo->parent == NULL);
ProcessInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
LocalExecuteInvalidationMessage);
}
/* Need not free anything explicitly */
transInvalInfo = NULL;
}
/*
* AtEOSubXact_Inval
* Process queued-up invalidation messages at end of subtransaction.
*
* If isCommit, process CurrentCmdInvalidMsgs if any (there probably aren't),
* and then attach both CurrentCmdInvalidMsgs and PriorCmdInvalidMsgs to the
* parent's PriorCmdInvalidMsgs list.
*
* If not isCommit, we are aborting, and must locally process the messages
* in PriorCmdInvalidMsgs. No messages need be sent to other backends.
* We can forget about CurrentCmdInvalidMsgs too, since those changes haven't
* touched the caches yet.
*
2004-08-29 07:07:03 +02:00
* In any case, pop the transaction stack. We need not physically free memory
* here, since CurTransactionContext is about to be emptied anyway
* (if aborting). Beware of the possibility of aborting the same nesting
* level twice, though.
*/
void
AtEOSubXact_Inval(bool isCommit)
{
int my_level = GetCurrentTransactionNestLevel();
TransInvalidationInfo *myInfo = transInvalInfo;
if (isCommit)
{
/* Must be at non-top of stack */
Assert(myInfo != NULL && myInfo->parent != NULL);
Assert(myInfo->my_level == my_level);
/* If CurrentCmdInvalidMsgs still has anything, fix it */
CommandEndInvalidationMessages();
/* Pass up my inval messages to parent */
AppendInvalidationMessages(&myInfo->parent->PriorCmdInvalidMsgs,
&myInfo->PriorCmdInvalidMsgs);
/* Pending relcache inval becomes parent's problem too */
if (myInfo->RelcacheInitFileInval)
myInfo->parent->RelcacheInitFileInval = true;
/* Pop the transaction state stack */
transInvalInfo = myInfo->parent;
/* Need not free anything else explicitly */
pfree(myInfo);
}
else if (myInfo != NULL && myInfo->my_level == my_level)
{
/* Must be at non-top of stack */
Assert(myInfo->parent != NULL);
ProcessInvalidationMessages(&myInfo->PriorCmdInvalidMsgs,
LocalExecuteInvalidationMessage);
/* Pop the transaction state stack */
transInvalInfo = myInfo->parent;
/* Need not free anything else explicitly */
pfree(myInfo);
}
}
/*
* CommandEndInvalidationMessages
* Process queued-up invalidation messages at end of one command
* in a transaction.
*
* Here, we send no messages to the shared queue, since we don't know yet if
* we will commit. We do need to locally process the CurrentCmdInvalidMsgs
* list, so as to flush our caches of any entries we have outdated in the
* current command. We then move the current-cmd list over to become part
* of the prior-cmds list.
*
* Note:
* This should be called during CommandCounterIncrement(),
* after we have advanced the command ID.
*/
void
CommandEndInvalidationMessages(void)
{
/*
2005-10-15 04:49:52 +02:00
* You might think this shouldn't be called outside any transaction, but
* bootstrap does it, and also ABORT issued when not in a transaction. So
* just quietly return if no state to work on.
*/
if (transInvalInfo == NULL)
return;
ProcessInvalidationMessages(&transInvalInfo->CurrentCmdInvalidMsgs,
LocalExecuteInvalidationMessage);
AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
&transInvalInfo->CurrentCmdInvalidMsgs);
}
/*
* BeginNonTransactionalInvalidation
* Prepare for invalidation messages for nontransactional updates.
*
* A nontransactional invalidation is one that must be sent whether or not
* the current transaction eventually commits. We arrange for all invals
* queued between this call and EndNonTransactionalInvalidation() to be sent
* immediately when the latter is called.
*
* Currently, this is only used by heap_page_prune(), and only when it is
* invoked during VACUUM FULL's first pass over a table. We expect therefore
* that we are not inside a subtransaction and there are no already-pending
* invalidations. This could be relaxed by setting up a new nesting level of
* invalidation data, but for now there's no need. Note that heap_page_prune
* knows that this function does not change any state, and therefore there's
* no need to worry about cleaning up if there's an elog(ERROR) before
* reaching EndNonTransactionalInvalidation (the invals will just be thrown
* away if that happens).
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
*
* Note that these are not replayed in standby mode.
*/
void
BeginNonTransactionalInvalidation(void)
{
/* Must be at top of stack */
Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
/* Must not have any previously-queued activity */
Assert(transInvalInfo->PriorCmdInvalidMsgs.cclist == NULL);
Assert(transInvalInfo->PriorCmdInvalidMsgs.rclist == NULL);
Assert(transInvalInfo->CurrentCmdInvalidMsgs.cclist == NULL);
Assert(transInvalInfo->CurrentCmdInvalidMsgs.rclist == NULL);
Assert(transInvalInfo->RelcacheInitFileInval == false);
Allow read only connections during recovery, known as Hot Standby. Enabled by recovery_connections = on (default) and forcing archive recovery using a recovery.conf. Recovery processing now emulates the original transactions as they are replayed, providing full locking and MVCC behaviour for read only queries. Recovery must enter consistent state before connections are allowed, so there is a delay, typically short, before connections succeed. Replay of recovering transactions can conflict and in some cases deadlock with queries during recovery; these result in query cancellation after max_standby_delay seconds have expired. Infrastructure changes have minor effects on normal running, though introduce four new types of WAL record. New test mode "make standbycheck" allows regression tests of static command behaviour on a standby server while in recovery. Typical and extreme dynamic behaviours have been checked via code inspection and manual testing. Few port specific behaviours have been utilised, though primary testing has been on Linux only so far. This commit is the basic patch. Additional changes will follow in this release to enhance some aspects of behaviour, notably improved handling of conflicts, deadlock detection and query cancellation. Changes to VACUUM FULL are also required. Simon Riggs, with significant and lengthy review by Heikki Linnakangas, including streamlined redesign of snapshot creation and two-phase commit. Important contributions from Florian Pflug, Mark Kirkwood, Merlin Moncure, Greg Stark, Gianni Ciolli, Gabriele Bartolini, Hannu Krosing, Robert Haas, Tatsuo Ishii, Hiroyuki Yamada plus support and feedback from many other community members.
2009-12-19 02:32:45 +01:00
SharedInvalidMessagesArray = NULL;
numSharedInvalidMessagesArray = 0;
}
/*
* EndNonTransactionalInvalidation
* Process queued-up invalidation messages for nontransactional updates.
*
* We expect to find messages in CurrentCmdInvalidMsgs only (else there
* was a CommandCounterIncrement within the "nontransactional" update).
* We must process them locally and send them out to the shared invalidation
* message queue.
*
* We must also reset the lists to empty and explicitly free memory (we can't
* rely on end-of-transaction cleanup for that).
*/
void
EndNonTransactionalInvalidation(void)
{
InvalidationChunk *chunk;
InvalidationChunk *next;
/* Must be at top of stack */
Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
/* Must not have any prior-command messages */
Assert(transInvalInfo->PriorCmdInvalidMsgs.cclist == NULL);
Assert(transInvalInfo->PriorCmdInvalidMsgs.rclist == NULL);
/*
* At present, this function is only used for CTID-changing updates; since
* the relcache init file doesn't store any tuple CTIDs, we don't have to
* invalidate it. That might not be true forever though, in which case
* we'd need code similar to AtEOXact_Inval.
*/
/* Send out the invals */
ProcessInvalidationMessages(&transInvalInfo->CurrentCmdInvalidMsgs,
LocalExecuteInvalidationMessage);
ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
SendSharedInvalidMessages);
/* Clean up and release memory */
for (chunk = transInvalInfo->CurrentCmdInvalidMsgs.cclist;
chunk != NULL;
chunk = next)
{
next = chunk->next;
pfree(chunk);
}
for (chunk = transInvalInfo->CurrentCmdInvalidMsgs.rclist;
chunk != NULL;
chunk = next)
{
next = chunk->next;
pfree(chunk);
}
transInvalInfo->CurrentCmdInvalidMsgs.cclist = NULL;
transInvalInfo->CurrentCmdInvalidMsgs.rclist = NULL;
transInvalInfo->RelcacheInitFileInval = false;
}
/*
* CacheInvalidateHeapTuple
* Register the given tuple for invalidation at end of command
* (ie, current command is creating or outdating this tuple).
*/
void
CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple)
{
PrepareForTupleInvalidation(relation, tuple);
}
/*
* CacheInvalidateRelcache
* Register invalidation of the specified relation's relcache entry
* at end of command.
*
* This is used in places that need to force relcache rebuild but aren't
* changing any of the tuples recognized as contributors to the relcache
* entry by PrepareForTupleInvalidation. (An example is dropping an index.)
*/
void
CacheInvalidateRelcache(Relation relation)
{
Oid databaseId;
Oid relationId;
relationId = RelationGetRelid(relation);
if (relation->rd_rel->relisshared)
databaseId = InvalidOid;
else
databaseId = MyDatabaseId;
RegisterRelcacheInvalidation(databaseId, relationId);
}
/*
* CacheInvalidateRelcacheByTuple
* As above, but relation is identified by passing its pg_class tuple.
*/
void
CacheInvalidateRelcacheByTuple(HeapTuple classTuple)
{
Form_pg_class classtup = (Form_pg_class) GETSTRUCT(classTuple);
Oid databaseId;
Oid relationId;
relationId = HeapTupleGetOid(classTuple);
if (classtup->relisshared)
databaseId = InvalidOid;
else
databaseId = MyDatabaseId;
RegisterRelcacheInvalidation(databaseId, relationId);
}
2002-04-30 00:14:34 +02:00
/*
* CacheInvalidateRelcacheByRelid
* As above, but relation is identified by passing its OID.
* This is the least efficient of the three options; use one of
* the above routines if you have a Relation or pg_class tuple.
*/
void
CacheInvalidateRelcacheByRelid(Oid relid)
{
HeapTuple tup;
tup = SearchSysCache(RELOID,
ObjectIdGetDatum(relid),
0, 0, 0);
if (!HeapTupleIsValid(tup))
elog(ERROR, "cache lookup failed for relation %u", relid);
CacheInvalidateRelcacheByTuple(tup);
ReleaseSysCache(tup);
}
/*
* CacheInvalidateSmgr
* Register invalidation of smgr references to a physical relation.
*
* Sending this type of invalidation msg forces other backends to close open
* smgr entries for the rel. This should be done to flush dangling open-file
* references when the physical rel is being dropped or truncated. Because
* these are nontransactional (i.e., not-rollback-able) operations, we just
* send the inval message immediately without any queuing.
*
* Note: in most cases there will have been a relcache flush issued against
* the rel at the logical level. We need a separate smgr-level flush because
* it is possible for backends to have open smgr entries for rels they don't
* have a relcache entry for, e.g. because the only thing they ever did with
* the rel is write out dirty shared buffers.
*
* Note: because these messages are nontransactional, they won't be captured
* in commit/abort WAL entries. Instead, calls to CacheInvalidateSmgr()
* should happen in low-level smgr.c routines, which are executed while
* replaying WAL as well as when creating it.
*/
void
CacheInvalidateSmgr(RelFileNode rnode)
{
SharedInvalidationMessage msg;
msg.sm.id = SHAREDINVALSMGR_ID;
msg.sm.rnode = rnode;
SendSharedInvalidMessages(&msg, 1);
}
2002-04-30 00:14:34 +02:00
/*
* CacheRegisterSyscacheCallback
* Register the specified function to be called for all future
* invalidation events in the specified cache. The cache ID and the
* TID of the tuple being invalidated will be passed to the function.
2002-04-30 00:14:34 +02:00
*
* NOTE: NULL will be passed for the TID if a cache reset request is received.
* In this case the called routines should flush all cached state.
2002-04-30 00:14:34 +02:00
*/
void
CacheRegisterSyscacheCallback(int cacheid,
SyscacheCallbackFunction func,
2002-04-30 00:14:34 +02:00
Datum arg)
{
if (syscache_callback_count >= MAX_SYSCACHE_CALLBACKS)
elog(FATAL, "out of syscache_callback_list slots");
2002-04-30 00:14:34 +02:00
syscache_callback_list[syscache_callback_count].id = cacheid;
syscache_callback_list[syscache_callback_count].function = func;
syscache_callback_list[syscache_callback_count].arg = arg;
2002-04-30 00:14:34 +02:00
++syscache_callback_count;
2002-04-30 00:14:34 +02:00
}
/*
* CacheRegisterRelcacheCallback
* Register the specified function to be called for all future
* relcache invalidation events. The OID of the relation being
* invalidated will be passed to the function.
*
* NOTE: InvalidOid will be passed if a cache reset request is received.
* In this case the called routines should flush all cached state.
*/
void
CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
2002-04-30 00:14:34 +02:00
Datum arg)
{
if (relcache_callback_count >= MAX_RELCACHE_CALLBACKS)
elog(FATAL, "out of relcache_callback_list slots");
2002-04-30 00:14:34 +02:00
relcache_callback_list[relcache_callback_count].function = func;
relcache_callback_list[relcache_callback_count].arg = arg;
2002-04-30 00:14:34 +02:00
++relcache_callback_count;
2002-04-30 00:14:34 +02:00
}