Accelerate end-of-transaction dropping of relations

When relations are dropped, at end of transaction we need to remove the
files and clean the buffer pool of buffers containing pages of those
relations.  Previously we would scan the buffer pool once per relation
to clean up buffers.  When there are many relations to drop, the
repeated scans make this process slow; so we now instead pass a list of
relations to drop and scan the pool once, checking each buffer against
the passed list.  When the number of relations is larger than a
threshold (which as of this patch is being set to 20 relations) we sort
the array before starting, and bsearch the array; when it's smaller, we
simply scan the array linearly each time, because that's faster.  The
exact optimal threshold value depends on many factors, but the
difference is not likely to be significant enough to justify making it
user-settable.

This has been measured to be a significant win (a 15x win when dropping
100,000 relations; an extreme case, but reportedly a real one).

Author: Tomas Vondra, some tweaks by me
Reviewed by: Robert Haas, Shigeru Hanada, Andres Freund, Álvaro Herrera
This commit is contained in:
Alvaro Herrera 2013-01-17 15:55:10 -03:00
parent 0b6329130e
commit 279628a0a7
5 changed files with 206 additions and 14 deletions

View File

@ -312,6 +312,10 @@ smgrDoPendingDeletes(bool isCommit)
PendingRelDelete *pending;
PendingRelDelete *prev;
PendingRelDelete *next;
int nrels = 0,
i = 0,
maxrels = 8;
SMgrRelation *srels = palloc(maxrels * sizeof(SMgrRelation));
prev = NULL;
for (pending = pendingDeletes; pending != NULL; pending = next)
@ -335,14 +339,32 @@ smgrDoPendingDeletes(bool isCommit)
SMgrRelation srel;
srel = smgropen(pending->relnode, pending->backend);
smgrdounlink(srel, false);
smgrclose(srel);
/* extend the array if needed (double the size) */
if (maxrels <= nrels)
{
maxrels *= 2;
srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
}
srels[nrels++] = srel;
}
/* must explicitly free the list entry */
pfree(pending);
/* prev does not change */
}
}
if (nrels > 0)
{
smgrdounlinkall(srels, nrels, false);
for (i = 0; i < nrels; i++)
smgrclose(srels[i]);
}
pfree(srels);
}
/*

View File

@ -62,6 +62,7 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
#define DROP_RELS_BSEARCH_THRESHOLD 20
/* GUC variables */
bool zero_damaged_pages = false;
@ -107,6 +108,7 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
bool *foundPtr);
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
static int rnode_comparator(const void *p1, const void *p2);
/*
@ -2086,43 +2088,103 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
}
/* ---------------------------------------------------------------------
* DropRelFileNodeAllBuffers
* DropRelFileNodesAllBuffers
*
* This function removes from the buffer pool all the pages of all
* forks of the specified relation. It's equivalent to calling
* DropRelFileNodeBuffers once per fork with firstDelBlock = 0.
* forks of the specified relations. It's equivalent to calling
* DropRelFileNodeBuffers once per fork per relation with
* firstDelBlock = 0.
* --------------------------------------------------------------------
*/
void
DropRelFileNodeAllBuffers(RelFileNodeBackend rnode)
DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
int i;
int i,
n = 0;
RelFileNode *nodes;
bool use_bsearch;
if (nnodes == 0)
return;
nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
/* If it's a local relation, it's localbuf.c's problem. */
if (RelFileNodeBackendIsTemp(rnode))
for (i = 0; i < nnodes; i++)
{
if (rnode.backend == MyBackendId)
DropRelFileNodeAllLocalBuffers(rnode.node);
if (RelFileNodeBackendIsTemp(rnodes[i]))
{
if (rnodes[i].backend == MyBackendId)
DropRelFileNodeAllLocalBuffers(rnodes[i].node);
}
else
nodes[n++] = rnodes[i].node;
}
/*
* If there are no non-local relations, then we're done. Release the memory
* and return.
*/
if (n == 0)
{
pfree(nodes);
return;
}
/*
* For a low number of relations to drop, just use a simple walk through to
* save the bsearch overhead. The threshold to use is a guess rather than an
* exactly determined value, as it depends on many factors (CPU and RAM
* speeds, amount of shared buffers, etc.).
*/
use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
/* sort the list of rnodes if necessary */
if (use_bsearch)
pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
for (i = 0; i < NBuffers; i++)
{
RelFileNode *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
if (!use_bsearch)
{
int j;
for (j = 0; j < n; j++)
{
if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
{
rnode = &nodes[j];
break;
}
}
}
else
{
rnode = bsearch((const void *) &(bufHdr->tag.rnode),
nodes, n, sizeof(RelFileNode),
rnode_comparator);
}
/* buffer doesn't belong to any of the given relfilenodes; skip it */
if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
pfree(nodes);
}
/* ---------------------------------------------------------------------
@ -2953,3 +3015,30 @@ local_buffer_write_error_callback(void *arg)
pfree(path);
}
}
/*
 * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
 *
 * Returns a negative, zero or positive value when the RelFileNode at p1
 * sorts before, equal to or after the one at p2.  Keys are compared in the
 * order relNode, dbNode, spcNode.  The particular order is not important as
 * long as sorting and searching use this same function.
 *
 * The arguments are only read, so they are accessed through pointers to
 * const rather than casting the qualifier away and copying the structs.
 */
static int
rnode_comparator(const void *p1, const void *p2)
{
	const RelFileNode *n1 = (const RelFileNode *) p1;
	const RelFileNode *n2 = (const RelFileNode *) p2;

	if (n1->relNode < n2->relNode)
		return -1;
	else if (n1->relNode > n2->relNode)
		return 1;

	if (n1->dbNode < n2->dbNode)
		return -1;
	else if (n1->dbNode > n2->dbNode)
		return 1;

	if (n1->spcNode < n2->spcNode)
		return -1;
	else if (n1->spcNode > n2->spcNode)
		return 1;
	else
		return 0;
}

View File

@ -390,7 +390,7 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
* Get rid of any remaining buffers for the relation. bufmgr will just
* drop them without bothering to write the contents.
*/
DropRelFileNodeAllBuffers(rnode);
DropRelFileNodesAllBuffers(&rnode, 1);
/*
* It'd be nice to tell the stats collector to forget it immediately, too.
@ -419,6 +419,86 @@ smgrdounlink(SMgrRelation reln, bool isRedo)
(*(smgrsw[which].smgr_unlink)) (rnode, InvalidForkNumber, isRedo);
}
/*
 *	smgrdounlinkall() -- Immediately unlink all forks of all given relations
 *
 *		Removes every fork of every relation in rels[0 .. nrels-1] from the
 *		store.  The removal cannot be undone, so this must not be used
 *		during transactional operations.
 *
 *		When isRedo is true, it is acceptable for the underlying file(s) to
 *		have disappeared already.
 *
 *		The effect matches calling smgrdounlink once per relation, but the
 *		buffer pool is cleaned with a single call for the whole list, so
 *		this is significantly quicker and should be preferred when possible.
 */
void
smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
{
	RelFileNodeBackend *rnodes;
	ForkNumber	fork;
	int			idx;

	/* An empty list requires no work at all. */
	if (nrels == 0)
		return;

	/*
	 * Collect the identity of every relation to be dropped into one array,
	 * closing each relation's forks at the smgr level along the way.
	 */
	rnodes = palloc(nrels * sizeof(RelFileNodeBackend));

	for (idx = 0; idx < nrels; idx++)
	{
		SMgrRelation reln = rels[idx];
		int			which = reln->smgr_which;

		rnodes[idx] = reln->smgr_rnode;

		/* Close the forks at smgr level */
		for (fork = 0; fork <= MAX_FORKNUM; fork++)
			smgrsw[which].smgr_close(reln, fork);
	}

	/*
	 * Discard whatever buffers remain for these relations.  bufmgr simply
	 * drops them; the contents are not written out.
	 */
	DropRelFileNodesAllBuffers(rnodes, nrels);

	/*
	 * Telling the stats collector to forget these relations right away would
	 * be nice as well, but the OIDs are not known here.
	 */

	/*
	 * Broadcast a shared-inval message per relation so that other backends
	 * close any dangling smgr references to them.  This happens before the
	 * physical unlinking starts, in case that step fails partway through.
	 * The messages eventually reach this backend too, which serves as a
	 * backstop that our own smgr rel got closed.
	 */
	for (idx = 0; idx < nrels; idx++)
		CacheInvalidateSmgr(rnodes[idx]);

	/*
	 * Finally remove the physical file(s), one fork at a time.
	 *
	 * Note: smgr_unlink must report a deletion failure as a WARNING rather
	 * than an ERROR, since the decision to commit or abort the current xact
	 * has already been made.
	 */
	for (idx = 0; idx < nrels; idx++)
	{
		int			which = rels[idx]->smgr_which;

		for (fork = 0; fork <= MAX_FORKNUM; fork++)
			smgrsw[which].smgr_unlink(rnodes[idx], fork, isRedo);
	}

	pfree(rnodes);
}
/*
* smgrdounlinkfork() -- Immediately unlink one fork of a relation.
*

View File

@ -188,7 +188,7 @@ extern void FlushRelationBuffers(Relation rel);
extern void FlushDatabaseBuffers(Oid dbid);
extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
ForkNumber forkNum, BlockNumber firstDelBlock);
extern void DropRelFileNodeAllBuffers(RelFileNodeBackend rnode);
extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
extern void DropDatabaseBuffers(Oid dbid);
#define RelationGetNumberOfBlocks(reln) \

View File

@ -85,6 +85,7 @@ extern void smgrcloseall(void);
extern void smgrclosenode(RelFileNodeBackend rnode);
extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrdounlink(SMgrRelation reln, bool isRedo);
extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
extern void smgrdounlinkfork(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);