Prevent multiple cleanup processes for the pending list in GIN.

Previously, ginInsertCleanup could exit early if it detected that someone else
was cleaning up the pending list, without waiting for that someone else to
finish the job. In that case vacuum could miss tuples that needed to be deleted.

The cleanup process now locks the metapage with the help of a heavyweight
LockPage(ExclusiveLock), which guarantees that no other cleanup process can run
at the same time. The lock is taken differently depending on the caller of the
cleanup process: any vacuum and gin_clean_pending_list() will block until the
lock becomes available, while an ordinary insert uses a conditional lock to
prevent waiting indefinitely on the lock.

Insertion into the pending list doesn't take this lock, so inserts aren't blocked.

Also, the patch stops the cleanup process once the tail page remembered at
cleanup start is reached, in order to prevent endless cleanup in the face of
massive insertion. It stops early only for automatic maintenance tasks such as
autovacuum.

The patch chooses the memory limit to use — autovacuum_work_mem,
maintenance_work_mem, or work_mem — depending on the call path.

A patch for earlier releases will need to be reworked, due to changes in this
area between 9.6 and previous versions.

Discovery and diagnostics by Jeff Janes and Tomas Vondra

Patch by me, with some ideas from Jeff Janes
This commit is contained in:
Teodor Sigaev 2016-04-28 16:21:42 +03:00
parent ad520ec4ac
commit e2c79e14d9
3 changed files with 77 additions and 62 deletions

View File

@ -27,7 +27,9 @@
#include "utils/memutils.h" #include "utils/memutils.h"
#include "utils/rel.h" #include "utils/rel.h"
#include "utils/acl.h" #include "utils/acl.h"
#include "postmaster/autovacuum.h"
#include "storage/indexfsm.h" #include "storage/indexfsm.h"
#include "storage/lmgr.h"
/* GUC parameter */ /* GUC parameter */
int gin_pending_list_limit = 0; int gin_pending_list_limit = 0;
@ -437,7 +439,7 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
END_CRIT_SECTION(); END_CRIT_SECTION();
if (needCleanup) if (needCleanup)
ginInsertCleanup(ginstate, true, NULL); ginInsertCleanup(ginstate, false, true, NULL);
} }
/* /*
@ -502,11 +504,8 @@ ginHeapTupleFastCollect(GinState *ginstate,
* If newHead == InvalidBlockNumber then function drops the whole list. * If newHead == InvalidBlockNumber then function drops the whole list.
* *
* metapage is pinned and exclusive-locked throughout this function. * metapage is pinned and exclusive-locked throughout this function.
*
* Returns true if another cleanup process is running concurrently
* (if so, we can just abandon our own efforts)
*/ */
static bool static void
shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
bool fill_fsm, IndexBulkDeleteResult *stats) bool fill_fsm, IndexBulkDeleteResult *stats)
{ {
@ -537,14 +536,7 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
data.ndeleted++; data.ndeleted++;
if (GinPageIsDeleted(page)) Assert(!GinPageIsDeleted(page));
{
/* concurrent cleanup process is detected */
for (i = 0; i < data.ndeleted; i++)
UnlockReleaseBuffer(buffers[i]);
return true;
}
nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
blknoToDelete = GinPageGetOpaque(page)->rightlink; blknoToDelete = GinPageGetOpaque(page)->rightlink;
@ -620,8 +612,6 @@ shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
RecordFreeIndexPage(index, freespace[i]); RecordFreeIndexPage(index, freespace[i]);
} while (blknoToDelete != newHead); } while (blknoToDelete != newHead);
return false;
} }
/* Initialize empty KeyArray */ /* Initialize empty KeyArray */
@ -722,18 +712,10 @@ processPendingPage(BuildAccumulator *accum, KeyArray *ka,
/* /*
* Move tuples from pending pages into regular GIN structure. * Move tuples from pending pages into regular GIN structure.
* *
* This can be called concurrently by multiple backends, so it must cope. * On first glance it looks completely not crash-safe. But if we crash
* On first glance it looks completely not concurrent-safe and not crash-safe * after posting entries to the main index and before removing them from the
* either. The reason it's okay is that multiple insertion of the same entry
* is detected and treated as a no-op by gininsert.c. If we crash after
* posting entries to the main index and before removing them from the
* pending list, it's okay because when we redo the posting later on, nothing * pending list, it's okay because when we redo the posting later on, nothing
* bad will happen. Likewise, if two backends simultaneously try to post * bad will happen.
* a pending entry into the main index, one will succeed and one will do
* nothing. We try to notice when someone else is a little bit ahead of
* us in the process, but that's just to avoid wasting cycles. Only the
* action of removing a page from the pending list really needs exclusive
* lock.
* *
* fill_fsm indicates that ginInsertCleanup should add deleted pages * fill_fsm indicates that ginInsertCleanup should add deleted pages
* to FSM otherwise caller is responsible to put deleted pages into * to FSM otherwise caller is responsible to put deleted pages into
@ -742,7 +724,7 @@ processPendingPage(BuildAccumulator *accum, KeyArray *ka,
* If stats isn't null, we count deleted pending pages into the counts. * If stats isn't null, we count deleted pending pages into the counts.
*/ */
void void
ginInsertCleanup(GinState *ginstate, ginInsertCleanup(GinState *ginstate, bool full_clean,
bool fill_fsm, IndexBulkDeleteResult *stats) bool fill_fsm, IndexBulkDeleteResult *stats)
{ {
Relation index = ginstate->index; Relation index = ginstate->index;
@ -755,8 +737,43 @@ ginInsertCleanup(GinState *ginstate,
oldCtx; oldCtx;
BuildAccumulator accum; BuildAccumulator accum;
KeyArray datums; KeyArray datums;
BlockNumber blkno; BlockNumber blkno,
blknoFinish;
bool cleanupFinish = false;
bool fsm_vac = false; bool fsm_vac = false;
Size workMemory;
bool inVacuum = (stats == NULL);
	/*
	 * We would like to prevent concurrent cleanup processes. To do that
	 * we lock the metapage in exclusive mode using a LockPage() call.
	 * Nobody else uses that lock on the metapage, so concurrent
	 * insertion into the pending list remains possible.
	 */
if (inVacuum)
{
		/*
		 * We are called from [auto]vacuum/analyze or
		 * gin_clean_pending_list(), and we would like to wait for any
		 * concurrent cleanup to finish.
		 */
LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
workMemory =
(IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ?
autovacuum_work_mem : maintenance_work_mem;
}
else
{
		/*
		 * We are called from a regular insert; if we see a concurrent
		 * cleanup, just exit in the hope that the concurrent process
		 * will clean up the pending list.
		 */
if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock))
return;
workMemory = work_mem;
}
metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
LockBuffer(metabuffer, GIN_SHARE); LockBuffer(metabuffer, GIN_SHARE);
@ -767,9 +784,16 @@ ginInsertCleanup(GinState *ginstate,
{ {
/* Nothing to do */ /* Nothing to do */
UnlockReleaseBuffer(metabuffer); UnlockReleaseBuffer(metabuffer);
UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
return; return;
} }
	/*
	 * Remember the tail page, to prevent endless cleanup if other backends
	 * add new tuples faster than we can clean them up.
	 */
blknoFinish = metadata->tail;
/* /*
* Read and lock head of pending list * Read and lock head of pending list
*/ */
@ -802,13 +826,15 @@ ginInsertCleanup(GinState *ginstate,
*/ */
for (;;) for (;;)
{ {
if (GinPageIsDeleted(page)) Assert(!GinPageIsDeleted(page));
{
/* another cleanup process is running concurrently */ /*
UnlockReleaseBuffer(buffer); * Are we walk through the page which as we remember was a tail when we
fsm_vac = false; * start our cleanup? But if caller asks us to clean up whole pending
break; * list then ignore old tail, we will work until list becomes empty.
} */
if (blkno == blknoFinish && full_clean == false)
cleanupFinish = true;
/* /*
* read page's datums into accum * read page's datums into accum
@ -821,13 +847,10 @@ ginInsertCleanup(GinState *ginstate,
* Is it time to flush memory to disk? Flush if we are at the end of * Is it time to flush memory to disk? Flush if we are at the end of
* the pending list, or if we have a full row and memory is getting * the pending list, or if we have a full row and memory is getting
* full. * full.
*
* XXX using up maintenance_work_mem here is probably unreasonably
* much, since vacuum might already be using that much.
*/ */
if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber || if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber ||
(GinPageHasFullRow(page) && (GinPageHasFullRow(page) &&
(accum.allocatedMemory >= (Size)maintenance_work_mem * 1024L))) (accum.allocatedMemory >= workMemory * 1024L)))
{ {
ItemPointerData *list; ItemPointerData *list;
uint32 nlist; uint32 nlist;
@ -864,14 +887,7 @@ ginInsertCleanup(GinState *ginstate,
LockBuffer(metabuffer, GIN_EXCLUSIVE); LockBuffer(metabuffer, GIN_EXCLUSIVE);
LockBuffer(buffer, GIN_SHARE); LockBuffer(buffer, GIN_SHARE);
if (GinPageIsDeleted(page)) Assert(!GinPageIsDeleted(page));
{
/* another cleanup process is running concurrently */
UnlockReleaseBuffer(buffer);
LockBuffer(metabuffer, GIN_UNLOCK);
fsm_vac = false;
break;
}
/* /*
* While we left the page unlocked, more stuff might have gotten * While we left the page unlocked, more stuff might have gotten
@ -904,13 +920,7 @@ ginInsertCleanup(GinState *ginstate,
* remove read pages from pending list, at this point all * remove read pages from pending list, at this point all
* content of read pages is in regular structure * content of read pages is in regular structure
*/ */
if (shiftList(index, metabuffer, blkno, fill_fsm, stats)) shiftList(index, metabuffer, blkno, fill_fsm, stats);
{
/* another cleanup process is running concurrently */
LockBuffer(metabuffer, GIN_UNLOCK);
fsm_vac = false;
break;
}
/* At this point, some pending pages have been freed up */ /* At this point, some pending pages have been freed up */
fsm_vac = true; fsm_vac = true;
@ -919,9 +929,10 @@ ginInsertCleanup(GinState *ginstate,
LockBuffer(metabuffer, GIN_UNLOCK); LockBuffer(metabuffer, GIN_UNLOCK);
/* /*
* if we removed the whole pending list just exit * if we removed the whole pending list or we cleanup tail (which
* we remembered on start our cleanup process) then just exit
*/ */
if (blkno == InvalidBlockNumber) if (blkno == InvalidBlockNumber || cleanupFinish)
break; break;
/* /*
@ -946,6 +957,7 @@ ginInsertCleanup(GinState *ginstate,
page = BufferGetPage(buffer); page = BufferGetPage(buffer);
} }
UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
ReleaseBuffer(metabuffer); ReleaseBuffer(metabuffer);
/* /*
@ -1004,7 +1016,7 @@ gin_clean_pending_list(PG_FUNCTION_ARGS)
memset(&stats, 0, sizeof(stats)); memset(&stats, 0, sizeof(stats));
initGinState(&ginstate, indexRel); initGinState(&ginstate, indexRel);
ginInsertCleanup(&ginstate, true, &stats); ginInsertCleanup(&ginstate, true, true, &stats);
index_close(indexRel, AccessShareLock); index_close(indexRel, AccessShareLock);

View File

@ -540,8 +540,10 @@ ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
{ {
/* Yes, so initialize stats to zeroes */ /* Yes, so initialize stats to zeroes */
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
/* and cleanup any pending inserts */ /*
ginInsertCleanup(&gvs.ginstate, false, stats); * and cleanup any pending inserts */
ginInsertCleanup(&gvs.ginstate, !IsAutoVacuumWorkerProcess(),
false, stats);
} }
/* we'll re-count the tuples each time */ /* we'll re-count the tuples each time */
@ -654,7 +656,7 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
if (IsAutoVacuumWorkerProcess()) if (IsAutoVacuumWorkerProcess())
{ {
initGinState(&ginstate, index); initGinState(&ginstate, index);
ginInsertCleanup(&ginstate, true, stats); ginInsertCleanup(&ginstate, false, true, stats);
} }
return stats; return stats;
} }
@ -667,7 +669,8 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{ {
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
initGinState(&ginstate, index); initGinState(&ginstate, index);
ginInsertCleanup(&ginstate, false, stats); ginInsertCleanup(&ginstate, !IsAutoVacuumWorkerProcess(),
false, stats);
} }
memset(&idxStat, 0, sizeof(idxStat)); memset(&idxStat, 0, sizeof(idxStat));

View File

@ -947,7 +947,7 @@ extern void ginHeapTupleFastCollect(GinState *ginstate,
GinTupleCollector *collector, GinTupleCollector *collector,
OffsetNumber attnum, Datum value, bool isNull, OffsetNumber attnum, Datum value, bool isNull,
ItemPointer ht_ctid); ItemPointer ht_ctid);
extern void ginInsertCleanup(GinState *ginstate, extern void ginInsertCleanup(GinState *ginstate, bool full_clean,
bool fill_fsm, IndexBulkDeleteResult *stats); bool fill_fsm, IndexBulkDeleteResult *stats);
/* ginpostinglist.c */ /* ginpostinglist.c */