From 218f51584d5a9fcdf702bcc7f54b5b65e255c187 Mon Sep 17 00:00:00 2001 From: Teodor Sigaev Date: Thu, 23 Mar 2017 19:38:47 +0300 Subject: [PATCH] Reduce page locking in GIN vacuum GIN vacuum during cleaning posting tree can lock this whole tree for a long time with by holding LockBufferForCleanup() on root. Patch changes it with two ways: first, cleanup lock will be taken only if there is an empty page (which should be deleted) and, second, it tries to lock only subtree, not the whole posting tree. Author: Andrey Borodin with minor editorization by me Reviewed-by: Jeff Davis, me https://commitfest.postgresql.org/13/896/ --- src/backend/access/gin/README | 15 +- src/backend/access/gin/ginbtree.c | 2 +- src/backend/access/gin/ginvacuum.c | 250 ++++++++++++++++------------- src/include/access/gin_private.h | 2 + 4 files changed, 152 insertions(+), 117 deletions(-) diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index fade0cbb61..990b5ffa58 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -314,10 +314,17 @@ deleted. The previous paragraph's reasoning only applies to searches, and only to posting trees. To protect from inserters following a downlink to a deleted page, vacuum simply locks out all concurrent insertions to the posting tree, -by holding a super-exclusive lock on the posting tree root. Inserters hold a -pin on the root page, but searches do not, so while new searches cannot begin -while root page is locked, any already-in-progress scans can continue -concurrently with vacuum. In the entry tree, we never delete pages. +by holding a super-exclusive lock on the parent page of subtree with deletable +pages. Inserters hold a pin on the root page, but searches do not, so while +new searches cannot begin while root page is locked, any already-in-progress +scans can continue concurrently with vacuum in corresponding subtree of +posting tree. To exclude interference with readers vacuum takes exclusive +locks in a depth-first scan in left-to-right order of page tuples. Leftmost +page is never deleted. Thus before deleting any page we obtain exclusive +lock on any left page, effectively excluding deadlock with any reader, despite +taking parent lock before current and left lock after current. We take left +lock not for a concurrency reasons, but rather in need to mark page dirty. +In the entry tree, we never delete pages. (This is quite different from the mechanism the btree indexam uses to make page-deletions safe; it stamps the deleted pages with an XID and keeps the diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 538ad5bb58..b02cb8ae58 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -31,7 +31,7 @@ static void ginFinishSplit(GinBtree btree, GinBtreeStack *stack, /* * Lock buffer by needed method for search. */ -static int +int ginTraverseLock(Buffer buffer, bool searchMode) { Page page; diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index c9ccfeece8..26c077a7bb 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -109,75 +109,17 @@ xlogVacuumPage(Relation index, Buffer buffer) PageSetLSN(page, recptr); } -static bool -ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer) + +typedef struct DataPageDeleteStack { - Buffer buffer; - Page page; - bool hasVoidPage = FALSE; - MemoryContext oldCxt; + struct DataPageDeleteStack *child; + struct DataPageDeleteStack *parent; - buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, - RBM_NORMAL, gvs->strategy); - page = BufferGetPage(buffer); + BlockNumber blkno; /* current block number */ + BlockNumber leftBlkno; /* rightest non-deleted page on left */ + bool isRoot; +} DataPageDeleteStack; - /* - * We should be sure that we don't concurrent with inserts, insert process - * never release root page until end (but it can unlock it and lock - * again). New scan can't start but previously started ones work - * concurrently. - */ - if (isRoot) - LockBufferForCleanup(buffer); - else - LockBuffer(buffer, GIN_EXCLUSIVE); - - Assert(GinPageIsData(page)); - - if (GinPageIsLeaf(page)) - { - oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); - ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs); - MemoryContextSwitchTo(oldCxt); - MemoryContextReset(gvs->tmpCxt); - - /* if root is a leaf page, we don't desire further processing */ - if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page)) - hasVoidPage = TRUE; - } - else - { - OffsetNumber i; - bool isChildHasVoid = FALSE; - - for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++) - { - PostingItem *pitem = GinDataPageGetPostingItem(page, i); - - if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL)) - isChildHasVoid = TRUE; - } - - if (isChildHasVoid) - hasVoidPage = TRUE; - } - - /* - * if we have root and there are empty pages in tree, then we don't - * release lock to go further processing and guarantee that tree is unused - */ - if (!(isRoot && hasVoidPage)) - { - UnlockReleaseBuffer(buffer); - } - else - { - Assert(rootBuffer); - *rootBuffer = buffer; - } - - return hasVoidPage; -} /* * Delete a posting tree page. @@ -194,8 +136,13 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn BlockNumber rightlink; /* - * Lock the pages in the same order as an insertion would, to avoid - * deadlocks: left, then right, then parent. + * This function MUST be called only if someone of parent pages hold + * exclusive cleanup lock. This guarantees that no insertions currently + * happen in this subtree. Caller also acquire Exclusive lock on deletable + * page and is acquiring and releasing exclusive lock on left page before. + * Left page was locked and released. Then parent and this page are locked. + * We acquire left page lock here only to mark page dirty after changing + * right pointer. */ lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, RBM_NORMAL, gvs->strategy); @@ -205,10 +152,6 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn RBM_NORMAL, gvs->strategy); LockBuffer(lBuffer, GIN_EXCLUSIVE); - LockBuffer(dBuffer, GIN_EXCLUSIVE); - if (!isParentRoot) /* parent is already locked by - * LockBufferForCleanup() */ - LockBuffer(pBuffer, GIN_EXCLUSIVE); START_CRIT_SECTION(); @@ -272,26 +215,15 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn PageSetLSN(BufferGetPage(lBuffer), recptr); } - if (!isParentRoot) - LockBuffer(pBuffer, GIN_UNLOCK); ReleaseBuffer(pBuffer); UnlockReleaseBuffer(lBuffer); - UnlockReleaseBuffer(dBuffer); + ReleaseBuffer(dBuffer); END_CRIT_SECTION(); gvs->result->pages_deleted++; } -typedef struct DataPageDeleteStack -{ - struct DataPageDeleteStack *child; - struct DataPageDeleteStack *parent; - - BlockNumber blkno; /* current block number */ - BlockNumber leftBlkno; /* rightest non-deleted page on left */ - bool isRoot; -} DataPageDeleteStack; /* * scans posting tree and deletes empty pages @@ -325,6 +257,10 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, RBM_NORMAL, gvs->strategy); + + if(!isRoot) + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); Assert(GinPageIsData(page)); @@ -359,6 +295,9 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, } } + if(!isRoot) + LockBuffer(buffer, GIN_UNLOCK); + ReleaseBuffer(buffer); if (!meDelete) @@ -367,37 +306,124 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, return meDelete; } + +/* + * Scan through posting tree, delete empty tuples from leaf pages. + * Also, this function collects empty subtrees (with all empty leafs). + * For parents of these subtrees CleanUp lock is taken, then we call + * ScanToDelete. This is done for every inner page, which points to + * empty subtree. + */ +static bool +ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot) +{ + Buffer buffer; + Page page; + bool hasVoidPage = FALSE; + MemoryContext oldCxt; + + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, + RBM_NORMAL, gvs->strategy); + page = BufferGetPage(buffer); + + ginTraverseLock(buffer,false); + + Assert(GinPageIsData(page)); + + if (GinPageIsLeaf(page)) + { + oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); + ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs); + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(gvs->tmpCxt); + + /* if root is a leaf page, we don't desire further processing */ + if (GinDataLeafPageIsEmpty(page)) + hasVoidPage = TRUE; + + UnlockReleaseBuffer(buffer); + + return hasVoidPage; + } + else + { + OffsetNumber i; + bool hasEmptyChild = FALSE; + bool hasNonEmptyChild = FALSE; + OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; + BlockNumber* children = palloc(sizeof(BlockNumber) * (maxoff + 1)); + + /* + * Read all children BlockNumbers. + * Not sure it is safe if there are many concurrent vacuums. + */ + + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + PostingItem *pitem = GinDataPageGetPostingItem(page, i); + + children[i] = PostingItemGetBlockNumber(pitem); + } + + UnlockReleaseBuffer(buffer); + + for (i = FirstOffsetNumber; i <= maxoff; i++) + { + if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE)) + hasEmptyChild = TRUE; + else + hasNonEmptyChild = TRUE; + } + + pfree(children); + + vacuum_delay_point(); + + /* + * All subtree is empty - just return TRUE to indicate that parent must + * do a cleanup. Unless we are ROOT an there is way to go upper. + */ + + if(hasEmptyChild && !hasNonEmptyChild && !isRoot) + return TRUE; + + if(hasEmptyChild) + { + DataPageDeleteStack root, + *ptr, + *tmp; + + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, + RBM_NORMAL, gvs->strategy); + LockBufferForCleanup(buffer); + + memset(&root, 0, sizeof(DataPageDeleteStack)); + root.leftBlkno = InvalidBlockNumber; + root.isRoot = TRUE; + + ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber); + + ptr = root.child; + + while (ptr) + { + tmp = ptr->child; + pfree(ptr); + ptr = tmp; + } + + UnlockReleaseBuffer(buffer); + } + + /* Here we have deleted all empty subtrees */ + return FALSE; + } +} + static void ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno) { - Buffer rootBuffer = InvalidBuffer; - DataPageDeleteStack root, - *ptr, - *tmp; - - if (ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer) == FALSE) - { - Assert(rootBuffer == InvalidBuffer); - return; - } - - memset(&root, 0, sizeof(DataPageDeleteStack)); - root.leftBlkno = InvalidBlockNumber; - root.isRoot = TRUE; - - vacuum_delay_point(); - - ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber); - - ptr = root.child; - while (ptr) - { - tmp = ptr->child; - pfree(ptr); - ptr = tmp; - } - - UnlockReleaseBuffer(rootBuffer); + ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE); } /* diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 34e7339f05..824cc1c9d8 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -471,4 +471,6 @@ ginCompareItemPointers(ItemPointer a, ItemPointer b) return -1; } +extern int ginTraverseLock(Buffer buffer, bool searchMode); + #endif /* GIN_PRIVATE_H */