Avoid memory leaks when a GatherMerge node is rescanned.

Rescanning a GatherMerge led to leaking some memory in the executor's
query-lifespan context, because most of the node's working data structures
were simply abandoned and rebuilt from scratch.  In practice, this might
never amount to much, given the cost of relaunching worker processes ---
but it's still pretty messy, so let's fix it.
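
As an illustrative, stand-alone sketch of the pattern being adopted here (not
PostgreSQL code; plain calloc/memset stand in for palloc0 and the executor's
reset logic, and the WorkArea names are made up for this example): allocate the
working arrays once up front, then merely empty them on each rescan instead of
abandoning and reallocating them.

    #include <stdlib.h>
    #include <string.h>

    typedef struct WorkArea
    {
        void  **slots;          /* analogous to gm_slots[] */
        int     nslots;         /* leader + workers */
    } WorkArea;

    /* one-time setup, analogous in spirit to gather_merge_setup() */
    static void
    workarea_setup(WorkArea *wa, int nworkers)
    {
        wa->nslots = nworkers + 1;
        wa->slots = calloc(wa->nslots, sizeof(void *));
    }

    /* per-(re)scan reset, analogous to what gather_merge_init() now does */
    static void
    workarea_reset(WorkArea *wa)
    {
        memset(wa->slots, 0, wa->nslots * sizeof(void *));
    }

    int
    main(void)
    {
        WorkArea    wa;

        workarea_setup(&wa, 4);         /* done once, at node init time */
        for (int rescan = 0; rescan < 3; rescan++)
            workarea_reset(&wa);        /* done per rescan; nothing leaks */
        free(wa.slots);
        return 0;
    }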

We can rearrange things so that the tuple arrays are simply cleared and
reused, and we don't need to rebuild the TupleTableSlots either, just
clear them.  One small complication is that because we might get a
different number of workers on each iteration, we can't keep the old
convention that the leader's gm_slots[] entry is the last one; the leader
might clobber a TupleTableSlot that we need for a worker in a future
iteration.  Hence, adjust the logic so that the leader has slot 0 always,
while the active workers have slots 1..n.
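
For concreteness, the resulting numbering can be pictured with this stand-alone
illustration (not executor code; it only mirrors how the loops in
nodeGatherMerge.c below index gm_slots[], gm_tuple_buffers[], and reader[]):

    #include <stdio.h>

    int
    main(void)
    {
        int     nreaders = 3;   /* pretend three workers are active */
        int     i;

        for (i = 0; i <= nreaders; i++)
        {
            if (i == 0)
                printf("gm_slots[0] -> leader (no tuple buffer, no queue reader)\n");
            else
                printf("gm_slots[%d] -> worker %d, gm_tuple_buffers[%d], reader[%d]\n",
                       i, i, i - 1, i - 1);
        }
        return 0;
    }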

Back-patch to v10 to keep all the existing versions of nodeGatherMerge.c
in sync --- because of the renumbering of the slots, there would otherwise
be a very large risk that any future backpatches in this module would
introduce bugs.

Discussion: https://postgr.es/m/8670.1504192177@sss.pgh.pa.us

Tom Lane  2017-08-31 16:20:58 -04:00
parent 30833ba154  commit 2d44c58c79
2 changed files with 114 additions and 62 deletions

src/backend/executor/nodeGatherMerge.c

@@ -55,8 +55,10 @@ static int32 heap_compare_slots(Datum a, Datum b, void *arg);
 static TupleTableSlot *gather_merge_getnext(GatherMergeState *gm_state);
 static HeapTuple gm_readnext_tuple(GatherMergeState *gm_state, int nreader,
 				   bool nowait, bool *done);
-static void gather_merge_init(GatherMergeState *gm_state);
 static void ExecShutdownGatherMergeWorkers(GatherMergeState *node);
+static void gather_merge_setup(GatherMergeState *gm_state);
+static void gather_merge_init(GatherMergeState *gm_state);
+static void gather_merge_clear_tuples(GatherMergeState *gm_state);
 static bool gather_merge_readnext(GatherMergeState *gm_state, int reader,
 				   bool nowait);
 static void load_tuple_array(GatherMergeState *gm_state, int reader);
@@ -149,14 +151,17 @@ ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags)
 	}
 
 	/*
-	 * store the tuple descriptor into gather merge state, so we can use it
-	 * later while initializing the gather merge slots.
+	 * Store the tuple descriptor into gather merge state, so we can use it
+	 * while initializing the gather merge slots.
 	 */
 	if (!ExecContextForcesOids(&gm_state->ps, &hasoid))
 		hasoid = false;
 	tupDesc = ExecTypeFromTL(outerNode->targetlist, hasoid);
 	gm_state->tupDesc = tupDesc;
 
+	/* Now allocate the workspace for gather merge */
+	gather_merge_setup(gm_state);
+
 	return gm_state;
 }
@@ -340,6 +345,9 @@ ExecReScanGatherMerge(GatherMergeState *node)
 	/* Make sure any existing workers are gracefully shut down */
 	ExecShutdownGatherMergeWorkers(node);
 
+	/* Free any unused tuples, so we don't leak memory across rescans */
+	gather_merge_clear_tuples(node);
+
 	/* Mark node so that shared state will be rebuilt at next call */
 	node->initialized = false;
 	node->gm_initialized = false;
@@ -370,9 +378,65 @@
 }
 
 /*
- * Initialize the Gather merge tuple read.
+ * Set up the data structures that we'll need for Gather Merge.
  *
- * Pull at least a single tuple from each worker + leader and set up the heap.
+ * We allocate these once on the basis of gm->num_workers, which is an
+ * upper bound for the number of workers we'll actually have. During
+ * a rescan, we reset the structures to empty. This approach simplifies
+ * not leaking memory across rescans.
+ *
+ * In the gm_slots[] array, index 0 is for the leader, and indexes 1 to n
+ * are for workers. The values placed into gm_heap correspond to indexes
+ * in gm_slots[]. The gm_tuple_buffers[] array, however, is indexed from
+ * 0 to n-1; it has no entry for the leader.
+ */
+static void
+gather_merge_setup(GatherMergeState *gm_state)
+{
+	GatherMerge *gm = castNode(GatherMerge, gm_state->ps.plan);
+	int nreaders = gm->num_workers;
+	int i;
+
+	/*
+	 * Allocate gm_slots for the number of workers + one more slot for leader.
+	 * Slot 0 is always for the leader. Leader always calls ExecProcNode() to
+	 * read the tuple, and then stores it directly into its gm_slots entry.
+	 * For other slots, code below will call ExecInitExtraTupleSlot() to
+	 * create a slot for the worker's results. Note that during any single
+	 * scan, we might have fewer than num_workers available workers, in which
+	 * case the extra array entries go unused.
+	 */
+	gm_state->gm_slots = (TupleTableSlot **)
+		palloc0((nreaders + 1) * sizeof(TupleTableSlot *));
+
+	/* Allocate the tuple slot and tuple array for each worker */
+	gm_state->gm_tuple_buffers = (GMReaderTupleBuffer *)
+		palloc0(nreaders * sizeof(GMReaderTupleBuffer));
+
+	for (i = 0; i < nreaders; i++)
+	{
+		/* Allocate the tuple array with length MAX_TUPLE_STORE */
+		gm_state->gm_tuple_buffers[i].tuple =
+			(HeapTuple *) palloc0(sizeof(HeapTuple) * MAX_TUPLE_STORE);
+
+		/* Initialize tuple slot for worker */
+		gm_state->gm_slots[i + 1] = ExecInitExtraTupleSlot(gm_state->ps.state);
+		ExecSetSlotDescriptor(gm_state->gm_slots[i + 1],
+							  gm_state->tupDesc);
+	}
+
+	/* Allocate the resources for the merge */
+	gm_state->gm_heap = binaryheap_allocate(nreaders + 1,
+											heap_compare_slots,
+											gm_state);
+}
+
+/*
+ * Initialize the Gather Merge.
+ *
+ * Reset data structures to ensure they're empty. Then pull at least one
+ * tuple from leader + each worker (or set its "done" indicator), and set up
+ * the heap.
  */
 static void
 gather_merge_init(GatherMergeState *gm_state)
@@ -381,38 +445,26 @@ gather_merge_init(GatherMergeState *gm_state)
 	bool nowait = true;
 	int i;
 
-	/*
-	 * Allocate gm_slots for the number of workers + one more slot for leader.
-	 * Last slot is always for leader. Leader always calls ExecProcNode() to
-	 * read the tuple which will return the TupleTableSlot. Later it will
-	 * directly get assigned to gm_slot. So just initialize leader gm_slot
-	 * with NULL. For other slots, code below will call
-	 * ExecInitExtraTupleSlot() to create a slot for the worker's results.
-	 */
-	gm_state->gm_slots =
-		palloc((gm_state->nreaders + 1) * sizeof(TupleTableSlot *));
-	gm_state->gm_slots[gm_state->nreaders] = NULL;
-
-	/* Initialize the tuple slot and tuple array for each worker */
-	gm_state->gm_tuple_buffers =
-		(GMReaderTupleBuffer *) palloc0(sizeof(GMReaderTupleBuffer) *
-										gm_state->nreaders);
-	for (i = 0; i < gm_state->nreaders; i++)
+	/* Assert that gather_merge_setup made enough space */
+	Assert(nreaders <= castNode(GatherMerge, gm_state->ps.plan)->num_workers);
+
+	/* Reset leader's tuple slot to empty */
+	gm_state->gm_slots[0] = NULL;
+
+	/* Reset the tuple slot and tuple array for each worker */
+	for (i = 0; i < nreaders; i++)
 	{
-		/* Allocate the tuple array with length MAX_TUPLE_STORE */
-		gm_state->gm_tuple_buffers[i].tuple =
-			(HeapTuple *) palloc0(sizeof(HeapTuple) * MAX_TUPLE_STORE);
-
-		/* Initialize slot for worker */
-		gm_state->gm_slots[i] = ExecInitExtraTupleSlot(gm_state->ps.state);
-		ExecSetSlotDescriptor(gm_state->gm_slots[i],
-							  gm_state->tupDesc);
+		/* Reset tuple array to empty */
+		gm_state->gm_tuple_buffers[i].nTuples = 0;
+		gm_state->gm_tuple_buffers[i].readCounter = 0;
+		/* Reset done flag to not-done */
+		gm_state->gm_tuple_buffers[i].done = false;
+		/* Ensure output slot is empty */
+		ExecClearTuple(gm_state->gm_slots[i + 1]);
 	}
 
-	/* Allocate the resources for the merge */
-	gm_state->gm_heap = binaryheap_allocate(gm_state->nreaders + 1,
-											heap_compare_slots,
-											gm_state);
+	/* Reset binary heap to empty */
+	binaryheap_reset(gm_state->gm_heap);
 
 	/*
 	 * First, try to read a tuple from each worker (including leader) in
@@ -422,14 +474,13 @@ gather_merge_init(GatherMergeState *gm_state)
 	 * least one tuple) to the heap.
 	 */
 reread:
-	for (i = 0; i < nreaders + 1; i++)
+	for (i = 0; i <= nreaders; i++)
 	{
 		CHECK_FOR_INTERRUPTS();
 
-		/* ignore this source if already known done */
-		if ((i < nreaders) ?
-			!gm_state->gm_tuple_buffers[i].done :
-			gm_state->need_to_scan_locally)
+		/* skip this source if already known done */
+		if ((i == 0) ? gm_state->need_to_scan_locally :
+			!gm_state->gm_tuple_buffers[i - 1].done)
 		{
 			if (TupIsNull(gm_state->gm_slots[i]))
 			{
@@ -450,9 +501,9 @@ reread:
 	}
 
 	/* need not recheck leader, since nowait doesn't matter for it */
-	for (i = 0; i < nreaders; i++)
+	for (i = 1; i <= nreaders; i++)
 	{
-		if (!gm_state->gm_tuple_buffers[i].done &&
+		if (!gm_state->gm_tuple_buffers[i - 1].done &&
 			TupIsNull(gm_state->gm_slots[i]))
 		{
 			nowait = false;
@@ -467,23 +518,23 @@ reread:
 }
 
 /*
- * Clear out the tuple table slots for each gather merge input.
+ * Clear out the tuple table slot, and any unused pending tuples,
+ * for each gather merge input.
  */
 static void
-gather_merge_clear_slots(GatherMergeState *gm_state)
+gather_merge_clear_tuples(GatherMergeState *gm_state)
 {
 	int i;
 
 	for (i = 0; i < gm_state->nreaders; i++)
 	{
-		pfree(gm_state->gm_tuple_buffers[i].tuple);
-		ExecClearTuple(gm_state->gm_slots[i]);
-	}
+		GMReaderTupleBuffer *tuple_buffer = &gm_state->gm_tuple_buffers[i];
 
-	/* Free tuple array as we don't need it any more */
-	pfree(gm_state->gm_tuple_buffers);
-	/* Free the binaryheap, which was created for sort */
-	binaryheap_free(gm_state->gm_heap);
+		while (tuple_buffer->readCounter < tuple_buffer->nTuples)
+			heap_freetuple(tuple_buffer->tuple[tuple_buffer->readCounter++]);
+
+		ExecClearTuple(gm_state->gm_slots[i + 1]);
+	}
 }
 
 /*
@@ -526,7 +577,7 @@ gather_merge_getnext(GatherMergeState *gm_state)
 		if (binaryheap_empty(gm_state->gm_heap))
 		{
 			/* All the queues are exhausted, and so is the heap */
-			gather_merge_clear_slots(gm_state);
+			gather_merge_clear_tuples(gm_state);
 			return NULL;
 		}
 		else
@@ -548,10 +599,10 @@ load_tuple_array(GatherMergeState *gm_state, int reader)
 	int i;
 
 	/* Don't do anything if this is the leader. */
-	if (reader == gm_state->nreaders)
+	if (reader == 0)
 		return;
 
-	tuple_buffer = &gm_state->gm_tuple_buffers[reader];
+	tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1];
 
 	/* If there's nothing in the array, reset the counters to zero. */
 	if (tuple_buffer->nTuples == tuple_buffer->readCounter)
@@ -590,7 +641,7 @@ gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait)
 	 * If we're being asked to generate a tuple from the leader, then we just
 	 * call ExecProcNode as normal to produce one.
 	 */
-	if (gm_state->nreaders == reader)
+	if (reader == 0)
 	{
 		if (gm_state->need_to_scan_locally)
 		{
@@ -601,7 +652,7 @@ gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait)
 
 			if (!TupIsNull(outerTupleSlot))
 			{
-				gm_state->gm_slots[reader] = outerTupleSlot;
+				gm_state->gm_slots[0] = outerTupleSlot;
 				return true;
 			}
 			/* need_to_scan_locally serves as "done" flag for leader */
@@ -611,7 +662,7 @@ gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait)
 	}
 
 	/* Otherwise, check the state of the relevant tuple buffer. */
-	tuple_buffer = &gm_state->gm_tuple_buffers[reader];
+	tuple_buffer = &gm_state->gm_tuple_buffers[reader - 1];
 
 	if (tuple_buffer->nTuples > tuple_buffer->readCounter)
 	{
@@ -621,8 +672,8 @@ gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait)
 	else if (tuple_buffer->done)
 	{
 		/* Reader is known to be exhausted. */
-		DestroyTupleQueueReader(gm_state->reader[reader]);
-		gm_state->reader[reader] = NULL;
+		DestroyTupleQueueReader(gm_state->reader[reader - 1]);
+		gm_state->reader[reader - 1] = NULL;
 		return false;
 	}
 	else
@@ -649,14 +700,14 @@ gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait)
 	ExecStoreTuple(tup,			/* tuple to store */
 				   gm_state->gm_slots[reader],	/* slot in which to store the
 												 * tuple */
-				   InvalidBuffer,	/* buffer associated with this tuple */
-				   true);		/* pfree this pointer if not from heap */
+				   InvalidBuffer,	/* no buffer associated with tuple */
+				   true);		/* pfree tuple when done with it */
 
 	return true;
 }
 
 /*
- * Attempt to read a tuple from given reader.
+ * Attempt to read a tuple from given worker.
  */
 static HeapTuple
 gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait,
@@ -671,7 +722,7 @@ gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait,
 	CHECK_FOR_INTERRUPTS();
 
 	/* Attempt to read a tuple. */
-	reader = gm_state->reader[nreader];
+	reader = gm_state->reader[nreader - 1];
 
 	/* Run TupleQueueReaders in per-tuple context */
 	tupleContext = gm_state->ps.ps_ExprContext->ecxt_per_tuple_memory;

src/include/nodes/execnodes.h

@@ -1958,7 +1958,8 @@ typedef struct GatherMergeState
 	int gm_nkeys;				/* number of sort columns */
 	SortSupport gm_sortkeys;	/* array of length gm_nkeys */
 	struct ParallelExecutorInfo *pei;
-	/* all remaining fields are reinitialized during a rescan: */
+	/* all remaining fields are reinitialized during a rescan */
+	/* (but the arrays are not reallocated, just cleared) */
 	int nworkers_launched;		/* original number of workers */
 	int nreaders;				/* number of active workers */
 	TupleTableSlot **gm_slots;	/* array with nreaders+1 entries */