2008-11-19 11:34:52 +01:00
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* storage.c
|
|
|
|
* code to create and destroy physical storage for relations
|
|
|
|
*
|
2015-01-06 17:43:47 +01:00
|
|
|
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
|
2008-11-19 11:34:52 +01:00
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
2010-09-20 22:08:53 +02:00
|
|
|
* src/backend/catalog/storage.c
|
2008-11-19 11:34:52 +01:00
|
|
|
*
|
|
|
|
* NOTES
|
|
|
|
* Some of this code used to be in storage/smgr/smgr.c, and the
|
|
|
|
* function names still reflect that.
|
|
|
|
*
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "postgres.h"
|
|
|
|
|
2008-12-03 14:05:22 +01:00
|
|
|
#include "access/visibilitymap.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "access/xact.h"
|
2014-11-06 12:52:08 +01:00
|
|
|
#include "access/xlog.h"
|
|
|
|
#include "access/xloginsert.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "access/xlogutils.h"
|
|
|
|
#include "catalog/catalog.h"
|
|
|
|
#include "catalog/storage.h"
|
2012-11-28 16:35:01 +01:00
|
|
|
#include "catalog/storage_xlog.h"
|
2008-11-19 11:34:52 +01:00
|
|
|
#include "storage/freespace.h"
|
|
|
|
#include "storage/smgr.h"
|
|
|
|
#include "utils/memutils.h"
|
|
|
|
#include "utils/rel.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We keep a list of all relations (represented as RelFileNode values)
|
|
|
|
* that have been created or deleted in the current transaction. When
|
|
|
|
* a relation is created, we create the physical file immediately, but
|
|
|
|
* remember it so that we can delete the file again if the current
|
2014-05-06 18:12:18 +02:00
|
|
|
* transaction is aborted. Conversely, a deletion request is NOT
|
2008-11-19 11:34:52 +01:00
|
|
|
* executed immediately, but is just entered in the list. When and if
|
|
|
|
* the transaction commits, we can delete the physical file.
|
|
|
|
*
|
|
|
|
* To handle subtransactions, every entry is marked with its transaction
|
|
|
|
* nesting level. At subtransaction commit, we reassign the subtransaction's
|
|
|
|
* entries to the parent nesting level. At subtransaction abort, we can
|
|
|
|
* immediately execute the abort-time actions for all entries of the current
|
|
|
|
* nesting level.
|
|
|
|
*
|
|
|
|
* NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
|
|
|
|
* unbetimes. It'd probably be OK to keep it in TopTransactionContext,
|
|
|
|
* but I'm being paranoid.
|
|
|
|
*/
|
|
|
|
|
|
|
|
typedef struct PendingRelDelete
|
|
|
|
{
|
|
|
|
RelFileNode relnode; /* relation that may need to be deleted */
|
2010-08-13 22:10:54 +02:00
|
|
|
BackendId backend; /* InvalidBackendId if not a temp rel */
|
2008-11-19 11:34:52 +01:00
|
|
|
bool atCommit; /* T=delete at commit; F=delete at abort */
|
|
|
|
int nestLevel; /* xact nesting level of request */
|
|
|
|
struct PendingRelDelete *next; /* linked-list link */
|
|
|
|
} PendingRelDelete;
|
|
|
|
|
|
|
|
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RelationCreateStorage
|
|
|
|
* Create physical storage for a relation.
|
|
|
|
*
|
|
|
|
* Create the underlying disk file storage for the relation. This only
|
|
|
|
* creates the main fork; additional forks are created lazily by the
|
|
|
|
* modules that need them.
|
|
|
|
*
|
|
|
|
* This function is transactional. The creation is WAL-logged, and if the
|
|
|
|
* transaction aborts later on, the storage will be destroyed.
|
|
|
|
*/
|
|
|
|
void
|
2010-12-13 18:34:26 +01:00
|
|
|
RelationCreateStorage(RelFileNode rnode, char relpersistence)
|
2008-11-19 11:34:52 +01:00
|
|
|
{
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
SMgrRelation srel;
|
2010-12-13 18:34:26 +01:00
|
|
|
BackendId backend;
|
|
|
|
bool needs_wal;
|
|
|
|
|
|
|
|
switch (relpersistence)
|
|
|
|
{
|
|
|
|
case RELPERSISTENCE_TEMP:
|
|
|
|
backend = MyBackendId;
|
|
|
|
needs_wal = false;
|
|
|
|
break;
|
2010-12-29 12:48:53 +01:00
|
|
|
case RELPERSISTENCE_UNLOGGED:
|
|
|
|
backend = InvalidBackendId;
|
|
|
|
needs_wal = false;
|
|
|
|
break;
|
2010-12-13 18:34:26 +01:00
|
|
|
case RELPERSISTENCE_PERMANENT:
|
|
|
|
backend = InvalidBackendId;
|
|
|
|
needs_wal = true;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
elog(ERROR, "invalid relpersistence: %c", relpersistence);
|
2011-04-10 17:42:00 +02:00
|
|
|
return; /* placate compiler */
|
2010-12-13 18:34:26 +01:00
|
|
|
}
|
2008-11-19 11:34:52 +01:00
|
|
|
|
2010-08-13 22:10:54 +02:00
|
|
|
srel = smgropen(rnode, backend);
|
2008-11-19 11:34:52 +01:00
|
|
|
smgrcreate(srel, MAIN_FORKNUM, false);
|
|
|
|
|
2010-12-13 18:34:26 +01:00
|
|
|
if (needs_wal)
|
2010-12-29 12:48:53 +01:00
|
|
|
log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
/* Add the relation to the list of stuff to delete at abort */
|
|
|
|
pending = (PendingRelDelete *)
|
|
|
|
MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
|
|
|
|
pending->relnode = rnode;
|
2010-08-13 22:10:54 +02:00
|
|
|
pending->backend = backend;
|
2008-11-19 11:34:52 +01:00
|
|
|
pending->atCommit = false; /* delete if abort */
|
|
|
|
pending->nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
pending->next = pendingDeletes;
|
|
|
|
pendingDeletes = pending;
|
|
|
|
}
|
|
|
|
|
2010-12-29 12:48:53 +01:00
|
|
|
/*
|
|
|
|
* Perform XLogInsert of a XLOG_SMGR_CREATE record to WAL.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
|
|
|
|
{
|
|
|
|
xl_smgr_create xlrec;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make an XLOG entry reporting the file creation.
|
|
|
|
*/
|
|
|
|
xlrec.rnode = *rnode;
|
|
|
|
xlrec.forkNum = forkNum;
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogBeginInsert();
|
|
|
|
XLogRegisterData((char *) &xlrec, sizeof(xlrec));
|
|
|
|
XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
|
2010-12-29 12:48:53 +01:00
|
|
|
}
|
|
|
|
|
2008-11-19 11:34:52 +01:00
|
|
|
/*
|
|
|
|
* RelationDropStorage
|
|
|
|
* Schedule unlinking of physical storage at transaction commit.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
RelationDropStorage(Relation rel)
|
|
|
|
{
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
|
|
|
|
/* Add the relation to the list of stuff to delete at commit */
|
|
|
|
pending = (PendingRelDelete *)
|
|
|
|
MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
|
|
|
|
pending->relnode = rel->rd_node;
|
2010-08-13 22:10:54 +02:00
|
|
|
pending->backend = rel->rd_backend;
|
2008-11-19 11:34:52 +01:00
|
|
|
pending->atCommit = true; /* delete if commit */
|
|
|
|
pending->nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
pending->next = pendingDeletes;
|
|
|
|
pendingDeletes = pending;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NOTE: if the relation was created in this transaction, it will now be
|
|
|
|
* present in the pending-delete list twice, once with atCommit true and
|
|
|
|
* once with atCommit false. Hence, it will be physically deleted at end
|
|
|
|
* of xact in either case (and the other entry will be ignored by
|
|
|
|
* smgrDoPendingDeletes, so no error will occur). We could instead remove
|
|
|
|
* the existing list entry and delete the physical file immediately, but
|
|
|
|
* for now I'll keep the logic simple.
|
|
|
|
*/
|
|
|
|
|
|
|
|
RelationCloseSmgr(rel);
|
|
|
|
}
|
|
|
|
|
2010-02-07 21:48:13 +01:00
|
|
|
/*
|
|
|
|
* RelationPreserveStorage
|
|
|
|
* Mark a relation as not to be deleted after all.
|
|
|
|
*
|
|
|
|
* We need this function because relation mapping changes are committed
|
|
|
|
* separately from commit of the whole transaction, so it's still possible
|
|
|
|
* for the transaction to abort after the mapping update is done.
|
|
|
|
* When a new physical relation is installed in the map, it would be
|
|
|
|
* scheduled for delete-on-abort, so we'd delete it, and be in trouble.
|
|
|
|
* The relation mapper fixes this by telling us to not delete such relations
|
|
|
|
* after all as part of its commit.
|
|
|
|
*
|
2011-07-18 17:02:48 +02:00
|
|
|
* We also use this to reuse an old build of an index during ALTER TABLE, this
|
|
|
|
* time removing the delete-at-commit entry.
|
|
|
|
*
|
2010-02-07 21:48:13 +01:00
|
|
|
* No-op if the relation is not among those scheduled for deletion.
|
|
|
|
*/
|
|
|
|
void
|
2011-07-18 17:02:48 +02:00
|
|
|
RelationPreserveStorage(RelFileNode rnode, bool atCommit)
|
2010-02-07 21:48:13 +01:00
|
|
|
{
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
PendingRelDelete *prev;
|
|
|
|
PendingRelDelete *next;
|
|
|
|
|
|
|
|
prev = NULL;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = next)
|
|
|
|
{
|
|
|
|
next = pending->next;
|
2011-07-18 17:02:48 +02:00
|
|
|
if (RelFileNodeEquals(rnode, pending->relnode)
|
|
|
|
&& pending->atCommit == atCommit)
|
2010-02-07 21:48:13 +01:00
|
|
|
{
|
|
|
|
/* unlink and delete list entry */
|
|
|
|
if (prev)
|
|
|
|
prev->next = next;
|
|
|
|
else
|
|
|
|
pendingDeletes = next;
|
|
|
|
pfree(pending);
|
|
|
|
/* prev does not change */
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* unrelated entry, don't touch it */
|
|
|
|
prev = pending;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-11-19 11:34:52 +01:00
|
|
|
/*
|
|
|
|
* RelationTruncate
|
|
|
|
* Physically truncate a relation to the specified number of blocks.
|
|
|
|
*
|
|
|
|
* This includes getting rid of any buffers for the blocks that are to be
|
2010-02-08 20:59:49 +01:00
|
|
|
* dropped.
|
2008-11-19 11:34:52 +01:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
RelationTruncate(Relation rel, BlockNumber nblocks)
|
|
|
|
{
|
2009-06-11 16:49:15 +02:00
|
|
|
bool fsm;
|
|
|
|
bool vm;
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
/* Open it at the smgr level if not already done */
|
|
|
|
RelationOpenSmgr(rel);
|
|
|
|
|
2010-02-09 22:43:30 +01:00
|
|
|
/*
|
|
|
|
* Make sure smgr_targblock etc aren't pointing somewhere past new end
|
|
|
|
*/
|
|
|
|
rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
|
|
|
|
rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
|
|
|
|
rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
/* Truncate the FSM first if it exists */
|
|
|
|
fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
|
|
|
|
if (fsm)
|
|
|
|
FreeSpaceMapTruncateRel(rel, nblocks);
|
|
|
|
|
2008-12-03 14:05:22 +01:00
|
|
|
/* Truncate the visibility map too if it exists. */
|
|
|
|
vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
|
|
|
|
if (vm)
|
|
|
|
visibilitymap_truncate(rel, nblocks);
|
|
|
|
|
2008-11-19 11:34:52 +01:00
|
|
|
/*
|
2009-06-11 16:49:15 +02:00
|
|
|
* We WAL-log the truncation before actually truncating, which means
|
|
|
|
* trouble if the truncation fails. If we then crash, the WAL replay
|
|
|
|
* likely isn't going to succeed in the truncation either, and cause a
|
|
|
|
* PANIC. It's tempting to put a critical section here, but that cure
|
|
|
|
* would be worse than the disease. It would turn a usually harmless
|
2010-02-07 21:48:13 +01:00
|
|
|
* failure to truncate, that might spell trouble at WAL replay, into a
|
2009-06-11 16:49:15 +02:00
|
|
|
* certain PANIC.
|
2008-11-19 11:34:52 +01:00
|
|
|
*/
|
2010-12-13 18:34:26 +01:00
|
|
|
if (RelationNeedsWAL(rel))
|
2008-11-19 11:34:52 +01:00
|
|
|
{
|
|
|
|
/*
|
2010-02-07 21:48:13 +01:00
|
|
|
* Make an XLOG entry reporting the file truncation.
|
2008-11-19 11:34:52 +01:00
|
|
|
*/
|
|
|
|
XLogRecPtr lsn;
|
|
|
|
xl_smgr_truncate xlrec;
|
|
|
|
|
|
|
|
xlrec.blkno = nblocks;
|
|
|
|
xlrec.rnode = rel->rd_node;
|
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogBeginInsert();
|
|
|
|
XLogRegisterData((char *) &xlrec, sizeof(xlrec));
|
2008-11-19 11:34:52 +01:00
|
|
|
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
lsn = XLogInsert(RM_SMGR_ID,
|
|
|
|
XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
/*
|
2009-06-11 16:49:15 +02:00
|
|
|
* Flush, because otherwise the truncation of the main relation might
|
|
|
|
* hit the disk before the WAL record, and the truncation of the FSM
|
|
|
|
* or visibility map. If we crashed during that window, we'd be left
|
|
|
|
* with a truncated heap, but the FSM or visibility map would still
|
|
|
|
* contain entries for the non-existent heap pages.
|
2008-11-19 11:34:52 +01:00
|
|
|
*/
|
2008-12-03 14:05:22 +01:00
|
|
|
if (fsm || vm)
|
2008-11-19 11:34:52 +01:00
|
|
|
XLogFlush(lsn);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Do the real work */
|
2010-08-13 22:10:54 +02:00
|
|
|
smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
|
2008-11-19 11:34:52 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
|
|
|
|
*
|
|
|
|
* This also runs when aborting a subxact; we want to clean up a failed
|
|
|
|
* subxact immediately.
|
2010-08-13 22:10:54 +02:00
|
|
|
*
|
|
|
|
* Note: It's possible that we're being asked to remove a relation that has
|
|
|
|
* no physical storage in any fork. In particular, it's possible that we're
|
|
|
|
* cleaning up an old temporary relation for which RemovePgTempFiles has
|
|
|
|
* already recovered the physical storage.
|
2008-11-19 11:34:52 +01:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
smgrDoPendingDeletes(bool isCommit)
|
|
|
|
{
|
|
|
|
int nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
PendingRelDelete *prev;
|
|
|
|
PendingRelDelete *next;
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
int nrels = 0,
|
|
|
|
i = 0,
|
2013-12-20 16:37:30 +01:00
|
|
|
maxrels = 0;
|
|
|
|
SMgrRelation *srels = NULL;
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
prev = NULL;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = next)
|
|
|
|
{
|
|
|
|
next = pending->next;
|
|
|
|
if (pending->nestLevel < nestLevel)
|
|
|
|
{
|
|
|
|
/* outer-level entries should not be processed yet */
|
|
|
|
prev = pending;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* unlink list entry first, so we don't retry on failure */
|
|
|
|
if (prev)
|
|
|
|
prev->next = next;
|
|
|
|
else
|
|
|
|
pendingDeletes = next;
|
|
|
|
/* do deletion if called for */
|
|
|
|
if (pending->atCommit == isCommit)
|
|
|
|
{
|
|
|
|
SMgrRelation srel;
|
|
|
|
|
2010-08-13 22:10:54 +02:00
|
|
|
srel = smgropen(pending->relnode, pending->backend);
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
|
2013-12-20 16:37:30 +01:00
|
|
|
/* allocate the initial array, or extend it, if needed */
|
|
|
|
if (maxrels == 0)
|
|
|
|
{
|
|
|
|
maxrels = 8;
|
2014-05-06 18:12:18 +02:00
|
|
|
srels = palloc(sizeof(SMgrRelation) * maxrels);
|
2013-12-20 16:37:30 +01:00
|
|
|
}
|
|
|
|
else if (maxrels <= nrels)
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
{
|
|
|
|
maxrels *= 2;
|
|
|
|
srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
|
|
|
|
}
|
|
|
|
|
|
|
|
srels[nrels++] = srel;
|
2008-11-19 11:34:52 +01:00
|
|
|
}
|
|
|
|
/* must explicitly free the list entry */
|
|
|
|
pfree(pending);
|
|
|
|
/* prev does not change */
|
|
|
|
}
|
|
|
|
}
|
Accelerate end-of-transaction dropping of relations
When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations. Previously we would scan the buffer pool once per relation
to clean up buffers. When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list. When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster. The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.
This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).
Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
2013-01-17 19:55:10 +01:00
|
|
|
|
|
|
|
if (nrels > 0)
|
|
|
|
{
|
|
|
|
smgrdounlinkall(srels, nrels, false);
|
|
|
|
|
|
|
|
for (i = 0; i < nrels; i++)
|
|
|
|
smgrclose(srels[i]);
|
|
|
|
|
2013-12-20 16:37:30 +01:00
|
|
|
pfree(srels);
|
|
|
|
}
|
2008-11-19 11:34:52 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2010-08-13 22:10:54 +02:00
|
|
|
* smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
|
2008-11-19 11:34:52 +01:00
|
|
|
*
|
|
|
|
* The return value is the number of relations scheduled for termination.
|
|
|
|
* *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
|
|
|
|
* If there are no relations to be deleted, *ptr is set to NULL.
|
|
|
|
*
|
2014-05-06 18:12:18 +02:00
|
|
|
* Only non-temporary relations are included in the returned list. This is OK
|
2010-08-13 22:10:54 +02:00
|
|
|
* because the list is used only in contexts where temporary relations don't
|
|
|
|
* matter: we're either writing to the two-phase state file (and transactions
|
|
|
|
* that have touched temp tables can't be prepared) or we're writing to xlog
|
|
|
|
* (and all temporary files will be zapped if we restart anyway, so no need
|
|
|
|
* for redo to do it also).
|
2008-11-19 11:34:52 +01:00
|
|
|
*
|
|
|
|
* Note that the list does not include anything scheduled for termination
|
|
|
|
* by upper-level transactions.
|
|
|
|
*/
|
|
|
|
int
|
2010-08-13 22:10:54 +02:00
|
|
|
smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
|
2008-11-19 11:34:52 +01:00
|
|
|
{
|
|
|
|
int nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
int nrels;
|
|
|
|
RelFileNode *rptr;
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
|
|
|
|
nrels = 0;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
|
|
|
|
{
|
2010-08-13 22:10:54 +02:00
|
|
|
if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
|
|
|
|
&& pending->backend == InvalidBackendId)
|
2008-11-19 11:34:52 +01:00
|
|
|
nrels++;
|
|
|
|
}
|
|
|
|
if (nrels == 0)
|
|
|
|
{
|
|
|
|
*ptr = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
|
|
|
|
*ptr = rptr;
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
|
|
|
|
{
|
2010-08-13 22:10:54 +02:00
|
|
|
if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
|
|
|
|
&& pending->backend == InvalidBackendId)
|
2008-11-19 11:34:52 +01:00
|
|
|
{
|
|
|
|
*rptr = pending->relnode;
|
|
|
|
rptr++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nrels;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* PostPrepare_smgr -- Clean up after a successful PREPARE
|
|
|
|
*
|
|
|
|
* What we have to do here is throw away the in-memory state about pending
|
|
|
|
* relation deletes. It's all been recorded in the 2PC state file and
|
|
|
|
* it's no longer smgr's job to worry about it.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
PostPrepare_smgr(void)
|
|
|
|
{
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
PendingRelDelete *next;
|
|
|
|
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = next)
|
|
|
|
{
|
|
|
|
next = pending->next;
|
|
|
|
pendingDeletes = next;
|
|
|
|
/* must explicitly free the list entry */
|
|
|
|
pfree(pending);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* AtSubCommit_smgr() --- Take care of subtransaction commit.
|
|
|
|
*
|
|
|
|
* Reassign all items in the pending-deletes list to the parent transaction.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
AtSubCommit_smgr(void)
|
|
|
|
{
|
|
|
|
int nestLevel = GetCurrentTransactionNestLevel();
|
|
|
|
PendingRelDelete *pending;
|
|
|
|
|
|
|
|
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
|
|
|
|
{
|
|
|
|
if (pending->nestLevel >= nestLevel)
|
|
|
|
pending->nestLevel = nestLevel - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* AtSubAbort_smgr() --- Take care of subtransaction abort.
|
|
|
|
*
|
|
|
|
* Delete created relations and forget about deleted relations.
|
|
|
|
* We can execute these operations immediately because we know this
|
|
|
|
* subtransaction will not commit.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
AtSubAbort_smgr(void)
|
|
|
|
{
|
|
|
|
smgrDoPendingDeletes(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
smgr_redo(XLogReaderState *record)
|
2008-11-19 11:34:52 +01:00
|
|
|
{
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
XLogRecPtr lsn = record->EndRecPtr;
|
|
|
|
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
|
2008-11-19 11:34:52 +01:00
|
|
|
|
2009-01-20 19:59:37 +01:00
|
|
|
/* Backup blocks are not used in smgr records */
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
Assert(!XLogRecHasAnyBlockRefs(record));
|
2009-01-20 19:59:37 +01:00
|
|
|
|
2008-11-19 11:34:52 +01:00
|
|
|
if (info == XLOG_SMGR_CREATE)
|
|
|
|
{
|
|
|
|
xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
|
|
|
|
SMgrRelation reln;
|
|
|
|
|
2010-08-13 22:10:54 +02:00
|
|
|
reln = smgropen(xlrec->rnode, InvalidBackendId);
|
2010-12-29 12:48:53 +01:00
|
|
|
smgrcreate(reln, xlrec->forkNum, true);
|
2008-11-19 11:34:52 +01:00
|
|
|
}
|
|
|
|
else if (info == XLOG_SMGR_TRUNCATE)
|
|
|
|
{
|
|
|
|
xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
|
|
|
|
SMgrRelation reln;
|
2010-02-09 22:43:30 +01:00
|
|
|
Relation rel;
|
2008-11-19 11:34:52 +01:00
|
|
|
|
2010-08-13 22:10:54 +02:00
|
|
|
reln = smgropen(xlrec->rnode, InvalidBackendId);
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Forcibly create relation if it doesn't exist (which suggests that
|
|
|
|
* it was dropped somewhere later in the WAL sequence). As in
|
Revamp the WAL record format.
Each WAL record now carries information about the modified relation and
block(s) in a standardized format. That makes it easier to write tools that
need that information, like pg_rewind, prefetching the blocks to speed up
recovery, etc.
There's a whole new API for building WAL records, replacing the XLogRecData
chains used previously. The new API consists of XLogRegister* functions,
which are called for each buffer and chunk of data that is added to the
record. The new API also gives more control over when a full-page image is
written, by passing flags to the XLogRegisterBuffer function.
This also simplifies the XLogReadBufferForRedo() calls. The function can dig
the relation and block number from the WAL record, so they no longer need to
be passed as arguments.
For the convenience of redo routines, XLogReader now disects each WAL record
after reading it, copying the main data part and the per-block data into
MAXALIGNed buffers. The data chunks are not aligned within the WAL record,
but the redo routines can assume that the pointers returned by XLogRecGet*
functions are. Redo routines are now passed the XLogReaderState, which
contains the record in the already-disected format, instead of the plain
XLogRecord.
The new record format also makes the fixed size XLogRecord header smaller,
by removing the xl_len field. The length of the "main data" portion is now
stored at the end of the WAL record, and there's a separate header after
XLogRecord for it. The alignment padding at the end of XLogRecord is also
removed. This compansates for the fact that the new format would otherwise
be more bulky than the old format.
Reviewed by Andres Freund, Amit Kapila, Michael Paquier, Alvaro Herrera,
Fujii Masao.
2014-11-20 16:56:26 +01:00
|
|
|
* XLogReadBufferForRedo, we prefer to recreate the rel and replay the
|
|
|
|
* log as best we can until the drop is seen.
|
2008-11-19 11:34:52 +01:00
|
|
|
*/
|
|
|
|
smgrcreate(reln, MAIN_FORKNUM, true);
|
|
|
|
|
2012-12-10 14:54:42 +01:00
|
|
|
/*
|
2013-05-29 22:58:43 +02:00
|
|
|
* Before we perform the truncation, update minimum recovery point to
|
|
|
|
* cover this WAL record. Once the relation is truncated, there's no
|
|
|
|
* going back. The buffer manager enforces the WAL-first rule for
|
|
|
|
* normal updates to relation files, so that the minimum recovery
|
|
|
|
* point is always updated before the corresponding change in the data
|
|
|
|
* file is flushed to disk. We have to do the same manually here.
|
2012-12-10 14:54:42 +01:00
|
|
|
*
|
|
|
|
* Doing this before the truncation means that if the truncation fails
|
|
|
|
* for some reason, you cannot start up the system even after restart,
|
|
|
|
* until you fix the underlying situation so that the truncation will
|
|
|
|
* succeed. Alternatively, we could update the minimum recovery point
|
|
|
|
* after truncation, but that would leave a small window where the
|
|
|
|
* WAL-first rule could be violated.
|
|
|
|
*/
|
|
|
|
XLogFlush(lsn);
|
|
|
|
|
2010-08-13 22:10:54 +02:00
|
|
|
smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
|
2008-11-19 11:34:52 +01:00
|
|
|
|
|
|
|
/* Also tell xlogutils.c about it */
|
|
|
|
XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
|
2009-06-11 16:49:15 +02:00
|
|
|
|
2010-02-09 22:43:30 +01:00
|
|
|
/* Truncate FSM and VM too */
|
|
|
|
rel = CreateFakeRelcacheEntry(xlrec->rnode);
|
2009-06-11 16:49:15 +02:00
|
|
|
|
2010-02-09 22:43:30 +01:00
|
|
|
if (smgrexists(reln, FSM_FORKNUM))
|
2008-11-19 11:34:52 +01:00
|
|
|
FreeSpaceMapTruncateRel(rel, xlrec->blkno);
|
2010-02-09 22:43:30 +01:00
|
|
|
if (smgrexists(reln, VISIBILITYMAP_FORKNUM))
|
|
|
|
visibilitymap_truncate(rel, xlrec->blkno);
|
|
|
|
|
|
|
|
FreeFakeRelcacheEntry(rel);
|
2008-11-19 11:34:52 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
elog(PANIC, "smgr_redo: unknown op code %u", info);
|
|
|
|
}
|