Reduce page locking in GIN vacuum

GIN vacuum, while cleaning a posting tree, can lock the whole tree for a long
time by holding LockBufferForCleanup() on the root. This patch changes that in
two ways: first, the cleanup lock is taken only if there is an empty page
(which should be deleted) and, second, vacuum tries to lock only the affected
subtree, not the whole posting tree.

Author: Andrey Borodin with minor editorization by me
Reviewed-by: Jeff Davis, me

https://commitfest.postgresql.org/13/896/
This commit is contained in:
Teodor Sigaev 2017-03-23 19:38:47 +03:00
parent 73561013e5
commit 218f51584d
4 changed files with 152 additions and 117 deletions

View File

@ -314,10 +314,17 @@ deleted.
The previous paragraph's reasoning only applies to searches, and only to
posting trees. To protect from inserters following a downlink to a deleted
page, vacuum simply locks out all concurrent insertions to the posting tree,
by holding a super-exclusive lock on the posting tree root. Inserters hold a
pin on the root page, but searches do not, so while new searches cannot begin
while root page is locked, any already-in-progress scans can continue
concurrently with vacuum. In the entry tree, we never delete pages.
by holding a super-exclusive lock on the parent page of the subtree that
contains deletable pages. Inserters hold a pin on the root page, but searches
do not, so while new searches cannot begin while the root page is locked, any
already-in-progress scans can continue concurrently with vacuum in the
corresponding subtree of the posting tree. To exclude interference with
readers, vacuum takes exclusive locks in a depth-first scan, in left-to-right
order of page tuples. The leftmost page is never deleted. Thus, before
deleting any page, we have already obtained an exclusive lock on some page to
its left, effectively excluding deadlock with any reader, despite taking the
parent's lock before the current page's lock and the left sibling's lock after
it. We take the left sibling's lock not for concurrency reasons, but because
we need to mark that page dirty. In the entry tree, we never delete pages.
(This is quite different from the mechanism the btree indexam uses to make
page-deletions safe; it stamps the deleted pages with an XID and keeps the

View File

@ -31,7 +31,7 @@ static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack,
/*
* Lock buffer by needed method for search.
*/
static int
int
ginTraverseLock(Buffer buffer, bool searchMode)
{
Page page;

View File

@ -109,75 +109,17 @@ xlogVacuumPage(Relation index, Buffer buffer)
PageSetLSN(page, recptr);
}
static bool
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer)
typedef struct DataPageDeleteStack
{
Buffer buffer;
Page page;
bool hasVoidPage = FALSE;
MemoryContext oldCxt;
struct DataPageDeleteStack *child;
struct DataPageDeleteStack *parent;
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
RBM_NORMAL, gvs->strategy);
page = BufferGetPage(buffer);
BlockNumber blkno; /* current block number */
BlockNumber leftBlkno; /* rightest non-deleted page on left */
bool isRoot;
} DataPageDeleteStack;
/*
* We should be sure that we don't concurrent with inserts, insert process
* never release root page until end (but it can unlock it and lock
* again). New scan can't start but previously started ones work
* concurrently.
*/
if (isRoot)
LockBufferForCleanup(buffer);
else
LockBuffer(buffer, GIN_EXCLUSIVE);
Assert(GinPageIsData(page));
if (GinPageIsLeaf(page))
{
oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
MemoryContextSwitchTo(oldCxt);
MemoryContextReset(gvs->tmpCxt);
/* if root is a leaf page, we don't desire further processing */
if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page))
hasVoidPage = TRUE;
}
else
{
OffsetNumber i;
bool isChildHasVoid = FALSE;
for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++)
{
PostingItem *pitem = GinDataPageGetPostingItem(page, i);
if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL))
isChildHasVoid = TRUE;
}
if (isChildHasVoid)
hasVoidPage = TRUE;
}
/*
* if we have root and there are empty pages in tree, then we don't
* release lock to go further processing and guarantee that tree is unused
*/
if (!(isRoot && hasVoidPage))
{
UnlockReleaseBuffer(buffer);
}
else
{
Assert(rootBuffer);
*rootBuffer = buffer;
}
return hasVoidPage;
}
/*
* Delete a posting tree page.
@ -194,8 +136,13 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
BlockNumber rightlink;
/*
* Lock the pages in the same order as an insertion would, to avoid
* deadlocks: left, then right, then parent.
* This function MUST be called only if some ancestor page holds an
* exclusive cleanup lock. This guarantees that no insertions currently
* happen in this subtree. The caller must also hold an exclusive lock on
* the page being deleted, and must previously have acquired and released
* an exclusive lock on the left page. Then the parent and this page are
* locked. We acquire the left page's lock here only to mark that page
* dirty after changing its right pointer.
*/
lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno,
RBM_NORMAL, gvs->strategy);
@ -205,10 +152,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
RBM_NORMAL, gvs->strategy);
LockBuffer(lBuffer, GIN_EXCLUSIVE);
LockBuffer(dBuffer, GIN_EXCLUSIVE);
if (!isParentRoot) /* parent is already locked by
* LockBufferForCleanup() */
LockBuffer(pBuffer, GIN_EXCLUSIVE);
START_CRIT_SECTION();
@ -272,26 +215,15 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn
PageSetLSN(BufferGetPage(lBuffer), recptr);
}
if (!isParentRoot)
LockBuffer(pBuffer, GIN_UNLOCK);
ReleaseBuffer(pBuffer);
UnlockReleaseBuffer(lBuffer);
UnlockReleaseBuffer(dBuffer);
ReleaseBuffer(dBuffer);
END_CRIT_SECTION();
gvs->result->pages_deleted++;
}
typedef struct DataPageDeleteStack
{
struct DataPageDeleteStack *child;
struct DataPageDeleteStack *parent;
BlockNumber blkno; /* current block number */
BlockNumber leftBlkno; /* rightest non-deleted page on left */
bool isRoot;
} DataPageDeleteStack;
/*
* scans posting tree and deletes empty pages
@ -325,6 +257,10 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
RBM_NORMAL, gvs->strategy);
if(!isRoot)
LockBuffer(buffer, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
Assert(GinPageIsData(page));
@ -359,6 +295,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
}
}
if(!isRoot)
LockBuffer(buffer, GIN_UNLOCK);
ReleaseBuffer(buffer);
if (!meDelete)
@ -367,37 +306,124 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot,
return meDelete;
}
/*
 * Scan through the posting tree, deleting empty tuples from leaf pages.
 * Also, this function collects empty subtrees (those all of whose leaves
 * are empty).  For the parent of each such subtree a cleanup lock is taken
 * and ginScanToDelete() is called.  This is done for every inner page that
 * points to an entirely empty subtree.
 *
 * Returns TRUE if the whole subtree rooted at blkno is empty, meaning the
 * caller (the parent level) must handle its deletion; returns FALSE once
 * every empty subtree reachable from here has already been deleted.
 */
static bool
ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot)
{
Buffer buffer;
Page page;
bool hasVoidPage = FALSE;
MemoryContext oldCxt;
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
RBM_NORMAL, gvs->strategy);
page = BufferGetPage(buffer);
/* Ordinary lock (searchMode = false), not a cleanup lock */
ginTraverseLock(buffer,false);
Assert(GinPageIsData(page));
if (GinPageIsLeaf(page))
{
/* Remove dead tuples from the leaf, using the short-lived temp context */
oldCxt = MemoryContextSwitchTo(gvs->tmpCxt);
ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs);
MemoryContextSwitchTo(oldCxt);
MemoryContextReset(gvs->tmpCxt);
/*
 * If the leaf became (or already was) empty, report that to the
 * caller; the page itself is deleted from the parent level via
 * ginScanToDelete().
 */
if (GinDataLeafPageIsEmpty(page))
hasVoidPage = TRUE;
UnlockReleaseBuffer(buffer);
return hasVoidPage;
}
else
{
OffsetNumber i;
bool hasEmptyChild = FALSE;
bool hasNonEmptyChild = FALSE;
OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff;
/* +1 because offsets are 1-based (FirstOffsetNumber); slot 0 is unused */
BlockNumber* children = palloc(sizeof(BlockNumber) * (maxoff + 1));
/*
 * Copy out all child block numbers before releasing the lock, so we
 * can recurse without holding it.  NOTE(review): the original author
 * was unsure whether this is safe when many vacuums run concurrently;
 * confirm the copied child list cannot go stale in a harmful way.
 */
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
PostingItem *pitem = GinDataPageGetPostingItem(page, i);
children[i] = PostingItemGetBlockNumber(pitem);
}
UnlockReleaseBuffer(buffer);
/* Recurse into every child, noting which subtrees are entirely empty */
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE))
hasEmptyChild = TRUE;
else
hasNonEmptyChild = TRUE;
}
pfree(children);
vacuum_delay_point();
/*
 * The whole subtree is empty - just return TRUE so that our parent
 * performs the cleanup.  Unless we are the root, in which case there
 * is no level further up and we must do the cleanup ourselves.
 */
if(hasEmptyChild && !hasNonEmptyChild && !isRoot)
return TRUE;
if(hasEmptyChild)
{
DataPageDeleteStack root,
*ptr,
*tmp;
/*
 * Some (but not all) children are empty, or we are the root: take a
 * cleanup lock on this page to lock out concurrent insertions into
 * the subtree, then delete the empty subtrees below it.
 */
buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno,
RBM_NORMAL, gvs->strategy);
LockBufferForCleanup(buffer);
memset(&root, 0, sizeof(DataPageDeleteStack));
root.leftBlkno = InvalidBlockNumber;
root.isRoot = TRUE;
ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber);
/* Free the delete-stack entries built up by ginScanToDelete() */
ptr = root.child;
while (ptr)
{
tmp = ptr->child;
pfree(ptr);
ptr = tmp;
}
UnlockReleaseBuffer(buffer);
}
/* At this point all empty subtrees below us have been deleted */
return FALSE;
}
}
static void
ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno)
{
Buffer rootBuffer = InvalidBuffer;
DataPageDeleteStack root,
*ptr,
*tmp;
if (ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer) == FALSE)
{
Assert(rootBuffer == InvalidBuffer);
return;
}
memset(&root, 0, sizeof(DataPageDeleteStack));
root.leftBlkno = InvalidBlockNumber;
root.isRoot = TRUE;
vacuum_delay_point();
ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber);
ptr = root.child;
while (ptr)
{
tmp = ptr->child;
pfree(ptr);
ptr = tmp;
}
UnlockReleaseBuffer(rootBuffer);
ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE);
}
/*

View File

@ -471,4 +471,6 @@ ginCompareItemPointers(ItemPointer a, ItemPointer b)
return -1;
}
extern int ginTraverseLock(Buffer buffer, bool searchMode);
#endif /* GIN_PRIVATE_H */