postgresql/src/backend/executor/nodeGatherMerge.c

/*-------------------------------------------------------------------------
 *
 * nodeGatherMerge.c
 *		Scan a plan in multiple workers, and do order-preserving merge.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeGatherMerge.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/relscan.h"
#include "access/xact.h"
#include "executor/execdebug.h"
#include "executor/execParallel.h"
#include "executor/nodeGatherMerge.h"
#include "executor/nodeSubplan.h"
#include "executor/tqueue.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/*
 * Tuple array for each worker
 */
typedef struct GMReaderTupleBuffer
{
	HeapTuple  *tuple;
	int			readCounter;
	int			nTuples;
	bool		done;
} GMReaderTupleBuffer;

/*
 * When we read tuples from workers, it's a good idea to read several at once
 * for efficiency when possible: this minimizes context-switching overhead.
 * But reading too many at a time wastes memory without improving performance.
 */
#define MAX_TUPLE_STORE 10

static int32 heap_compare_slots(Datum a, Datum b, void *arg);
static TupleTableSlot *gather_merge_getnext(GatherMergeState *gm_state);
static HeapTuple gm_readnext_tuple(GatherMergeState *gm_state, int nreader,
				  bool nowait, bool *done);
static void gather_merge_init(GatherMergeState *gm_state);
static void ExecShutdownGatherMergeWorkers(GatherMergeState *node);
static bool gather_merge_readnext(GatherMergeState *gm_state, int reader,
					  bool nowait);
static void form_tuple_array(GatherMergeState *gm_state, int reader);

/* ----------------------------------------------------------------
 *		ExecInitGather
 * ----------------------------------------------------------------
 */
GatherMergeState *
ExecInitGatherMerge(GatherMerge *node, EState *estate, int eflags)
{
	GatherMergeState *gm_state;
	Plan	   *outerNode;
	bool		hasoid;
	TupleDesc	tupDesc;

	/* Gather merge node doesn't have innerPlan node. */
	Assert(innerPlan(node) == NULL);

	/*
	 * create state structure
	 */
	gm_state = makeNode(GatherMergeState);
	gm_state->ps.plan = (Plan *) node;
	gm_state->ps.state = estate;

	/*
	 * Miscellaneous initialization
	 *
	 * create expression context for node
	 */
	ExecAssignExprContext(estate, &gm_state->ps);

	/*
	 * initialize child expressions
	 */
	gm_state->ps.qual =
		ExecInitQual(node->plan.qual, &gm_state->ps);

	/*
	 * tuple table initialization
	 */
	ExecInitResultTupleSlot(estate, &gm_state->ps);

	/*
	 * now initialize outer plan
	 */
	outerNode = outerPlan(node);
	outerPlanState(gm_state) = ExecInitNode(outerNode, estate, eflags);

	/*
	 * Initialize result tuple type and projection info.
	 */
	ExecAssignResultTypeFromTL(&gm_state->ps);
	ExecAssignProjectionInfo(&gm_state->ps, NULL);

	gm_state->gm_initialized = false;

	/*
	 * initialize sort-key information
	 */
	if (node->numCols)
	{
		int			i;

		gm_state->gm_nkeys = node->numCols;
		gm_state->gm_sortkeys =
			palloc0(sizeof(SortSupportData) * node->numCols);

		for (i = 0; i < node->numCols; i++)
		{
			SortSupport sortKey = gm_state->gm_sortkeys + i;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = node->collations[i];
			sortKey->ssup_nulls_first = node->nullsFirst[i];
			sortKey->ssup_attno = node->sortColIdx[i];

			/*
			 * We don't perform abbreviated key conversion here, for the same
			 * reasons that it isn't used in MergeAppend
			 */
			sortKey->abbreviate = false;

			PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey);
		}
	}

	/*
	 * store the tuple descriptor into gather merge state, so we can use it
	 * later while initializing the gather merge slots.
	 */
	if (!ExecContextForcesOids(&gm_state->ps, &hasoid))
		hasoid = false;
	tupDesc = ExecTypeFromTL(outerNode->targetlist, hasoid);
	gm_state->tupDesc = tupDesc;

	return gm_state;
}

/* ----------------------------------------------------------------
 *		ExecGatherMerge(node)
 *
 *		Scans the relation via multiple workers and returns
 *		the next qualifying tuple.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecGatherMerge(GatherMergeState *node)
{
	TupleTableSlot *slot;
	ExprContext *econtext;
	int			i;

	/*
	 * As with Gather, we don't launch workers until this node is actually
	 * executed.
	 */
	if (!node->initialized)
	{
		EState	   *estate = node->ps.state;
		GatherMerge *gm = (GatherMerge *) node->ps.plan;

		/*
		 * Sometimes we might have to run without parallelism; but if parallel
		 * mode is active then we can try to fire up some workers.
		 */
		if (gm->num_workers > 0 && IsInParallelMode())
		{
			ParallelContext *pcxt;

			/* Initialize data structures for workers. */
			if (!node->pei)
				node->pei = ExecInitParallelPlan(node->ps.lefttree,
												 estate,
												 gm->num_workers);

			/* Try to launch workers. */
			pcxt = node->pei->pcxt;
			LaunchParallelWorkers(pcxt);
			node->nworkers_launched = pcxt->nworkers_launched;

			/* Set up tuple queue readers to read the results. */
			if (pcxt->nworkers_launched > 0)
			{
				node->nreaders = 0;
				node->reader = palloc(pcxt->nworkers_launched *
									  sizeof(TupleQueueReader *));

				Assert(gm->numCols);

				for (i = 0; i < pcxt->nworkers_launched; ++i)
				{
					shm_mq_set_handle(node->pei->tqueue[i],
									  pcxt->worker[i].bgwhandle);
					node->reader[node->nreaders++] =
						CreateTupleQueueReader(node->pei->tqueue[i],
											   node->tupDesc);
				}
			}
			else
			{
				/* No workers?	Then never mind. */
				ExecShutdownGatherMergeWorkers(node);
			}
		}

		/* always allow leader to participate */
		node->need_to_scan_locally = true;
		node->initialized = true;
	}

	/*
	 * Reset per-tuple memory context to free any expression evaluation
	 * storage allocated in the previous tuple cycle.
	 */
	econtext = node->ps.ps_ExprContext;
	ResetExprContext(econtext);

	/*
	 * Get next tuple, either from one of our workers, or by running the plan
	 * ourselves.
	 */
	slot = gather_merge_getnext(node);
	if (TupIsNull(slot))
		return NULL;

	/*
	 * form the result tuple using ExecProject(), and return it --- unless the
	 * projection produces an empty set, in which case we must loop back
	 * around for another tuple
	 */
	econtext->ecxt_outertuple = slot;
	return ExecProject(node->ps.ps_ProjInfo);
}

/* ----------------------------------------------------------------
 *		ExecEndGatherMerge
 *
 *		frees any storage allocated through C routines.
 * ----------------------------------------------------------------
 */
void
ExecEndGatherMerge(GatherMergeState *node)
{
	ExecEndNode(outerPlanState(node));	/* let children clean up first */
	ExecShutdownGatherMerge(node);
	ExecFreeExprContext(&node->ps);
	ExecClearTuple(node->ps.ps_ResultTupleSlot);
}

/* ----------------------------------------------------------------
 *		ExecShutdownGatherMerge
 *
 *		Destroy the setup for parallel workers including parallel context.
 *		Collect all the stats after workers are stopped, else some work
 *		done by workers won't be accounted.
 * ----------------------------------------------------------------
 */
void
ExecShutdownGatherMerge(GatherMergeState *node)
{
	ExecShutdownGatherMergeWorkers(node);

	/* Now destroy the parallel context. */
	if (node->pei != NULL)
	{
		ExecParallelCleanup(node->pei);
		node->pei = NULL;
	}
}

/* ----------------------------------------------------------------
 *		ExecShutdownGatherMergeWorkers
 *
 *		Destroy the parallel workers.  Collect all the stats after
 *		workers are stopped, else some work done by workers won't be
 *		accounted.
 * ----------------------------------------------------------------
 */
static void
ExecShutdownGatherMergeWorkers(GatherMergeState *node)
{
	/* Shut down tuple queue readers before shutting down workers. */
	if (node->reader != NULL)
	{
		int			i;

		for (i = 0; i < node->nreaders; ++i)
			if (node->reader[i])
				DestroyTupleQueueReader(node->reader[i]);

		pfree(node->reader);
		node->reader = NULL;
	}

	/* Now shut down the workers. */
	if (node->pei != NULL)
		ExecParallelFinish(node->pei);
}

/* ----------------------------------------------------------------
 *		ExecReScanGatherMerge
 *
 *		Re-initialize the workers and rescans a relation via them.
 * ----------------------------------------------------------------
 */
void
ExecReScanGatherMerge(GatherMergeState *node)
{
	/*
	 * Re-initialize the parallel workers to perform rescan of relation. We
	 * want to gracefully shutdown all the workers so that they should be able
	 * to propagate any error or other information to master backend before
	 * dying.  Parallel context will be reused for rescan.
	 */
	ExecShutdownGatherMergeWorkers(node);

	node->initialized = false;

	if (node->pei)
		ExecParallelReinitialize(node->pei);

	ExecReScan(node->ps.lefttree);
}

/*
 * Initialize the Gather merge tuple read.
 *
 * Pull at least a single tuple from each worker + leader and set up the heap.
 */
static void
gather_merge_init(GatherMergeState *gm_state)
{
	int			nreaders = gm_state->nreaders;
	bool		initialize = true;
	int			i;

	/*
	 * Allocate gm_slots for the number of worker + one more slot for leader.
	 * Last slot is always for leader. Leader always calls ExecProcNode() to
	 * read the tuple which will return the TupleTableSlot. Later it will
	 * directly get assigned to gm_slot. So just initialize leader gm_slot
	 * with NULL. For other slots below code will call
	 * ExecInitExtraTupleSlot() which will do the initialization of worker
	 * slots.
	 */
	gm_state->gm_slots =
		palloc((gm_state->nreaders + 1) * sizeof(TupleTableSlot *));
	gm_state->gm_slots[gm_state->nreaders] = NULL;

	/* Initialize the tuple slot and tuple array for each worker */
	gm_state->gm_tuple_buffers =
		(GMReaderTupleBuffer *) palloc0(sizeof(GMReaderTupleBuffer) *
										(gm_state->nreaders + 1));
	for (i = 0; i < gm_state->nreaders; i++)
	{
		/* Allocate the tuple array with MAX_TUPLE_STORE size */
		gm_state->gm_tuple_buffers[i].tuple =
			(HeapTuple *) palloc0(sizeof(HeapTuple) * MAX_TUPLE_STORE);

		/* Initialize slot for worker */
		gm_state->gm_slots[i] = ExecInitExtraTupleSlot(gm_state->ps.state);
		ExecSetSlotDescriptor(gm_state->gm_slots[i],
							  gm_state->tupDesc);
	}

	/* Allocate the resources for the merge */
	gm_state->gm_heap = binaryheap_allocate(gm_state->nreaders + 1,
											heap_compare_slots,
											gm_state);

	/*
	 * First, try to read a tuple from each worker (including leader) in
	 * nowait mode, so that we initialize read from each worker as well as
	 * leader. After this, if all active workers are unable to produce a
	 * tuple, then re-read and this time use wait mode. For workers that were
	 * able to produce a tuple in the earlier loop and are still active, just
	 * try to fill the tuple array if more tuples are avaiable.
	 */
reread:
	for (i = 0; i < nreaders + 1; i++)
	{
		if (!gm_state->gm_tuple_buffers[i].done &&
			(TupIsNull(gm_state->gm_slots[i]) ||
			 gm_state->gm_slots[i]->tts_isempty))
		{
			if (gather_merge_readnext(gm_state, i, initialize))
			{
				binaryheap_add_unordered(gm_state->gm_heap,
										 Int32GetDatum(i));
			}
		}
		else
			form_tuple_array(gm_state, i);
	}
	initialize = false;

	for (i = 0; i < nreaders; i++)
		if (!gm_state->gm_tuple_buffers[i].done &&
			(TupIsNull(gm_state->gm_slots[i]) ||
			 gm_state->gm_slots[i]->tts_isempty))
			goto reread;

	binaryheap_build(gm_state->gm_heap);
	gm_state->gm_initialized = true;
}

/*
 * Clear out the tuple table slots for each gather merge input.
 */
static void
gather_merge_clear_slots(GatherMergeState *gm_state)
{
	int			i;

	for (i = 0; i < gm_state->nreaders; i++)
	{
		pfree(gm_state->gm_tuple_buffers[i].tuple);
		gm_state->gm_slots[i] = ExecClearTuple(gm_state->gm_slots[i]);
	}

	/* Free tuple array as we don't need it any more */
	pfree(gm_state->gm_tuple_buffers);
	/* Free the binaryheap, which was created for sort */
	binaryheap_free(gm_state->gm_heap);
}

/*
 * Read the next tuple for gather merge.
 *
 * Fetch the sorted tuple out of the heap.
 */
static TupleTableSlot *
gather_merge_getnext(GatherMergeState *gm_state)
{
	int			i;

	if (!gm_state->gm_initialized)
	{
		/*
		 * First time through: pull the first tuple from each participant, and
		 * set up the heap.
		 */
		gather_merge_init(gm_state);
	}
	else
	{
		/*
		 * Otherwise, pull the next tuple from whichever participant we
		 * returned from last time, and reinsert that participant's index into
		 * the heap, because it might now compare differently against the
		 * other elements of the heap.
		 */
		i = DatumGetInt32(binaryheap_first(gm_state->gm_heap));

		if (gather_merge_readnext(gm_state, i, false))
			binaryheap_replace_first(gm_state->gm_heap, Int32GetDatum(i));
		else
			(void) binaryheap_remove_first(gm_state->gm_heap);
	}

	if (binaryheap_empty(gm_state->gm_heap))
	{
		/* All the queues are exhausted, and so is the heap */
		gather_merge_clear_slots(gm_state);
		return NULL;
	}
	else
	{
		/* Return next tuple from whichever participant has the leading one */
		i = DatumGetInt32(binaryheap_first(gm_state->gm_heap));
		return gm_state->gm_slots[i];
	}
}

/*
 * Read the tuple for given reader in nowait mode, and form the tuple array.
 */
static void
form_tuple_array(GatherMergeState *gm_state, int reader)
{
	GMReaderTupleBuffer *tuple_buffer = &gm_state->gm_tuple_buffers[reader];
	int			i;

	/* Last slot is for leader and we don't build tuple array for leader */
	if (reader == gm_state->nreaders)
		return;

	/*
	 * We here because we already read all the tuples from the tuple array, so
	 * initialize the counter to zero.
	 */
	if (tuple_buffer->nTuples == tuple_buffer->readCounter)
		tuple_buffer->nTuples = tuple_buffer->readCounter = 0;

	/* Tuple array is already full? */
	if (tuple_buffer->nTuples == MAX_TUPLE_STORE)
		return;

	for (i = tuple_buffer->nTuples; i < MAX_TUPLE_STORE; i++)
	{
		tuple_buffer->tuple[i] = heap_copytuple(gm_readnext_tuple(gm_state,
																  reader,
																  false,
													   &tuple_buffer->done));
		if (!HeapTupleIsValid(tuple_buffer->tuple[i]))
			break;
		tuple_buffer->nTuples++;
	}
}

/*
 * Store the next tuple for a given reader into the appropriate slot.
 *
 * Returns false if the reader is exhausted, and true otherwise.
 */
static bool
gather_merge_readnext(GatherMergeState *gm_state, int reader, bool nowait)
{
	GMReaderTupleBuffer *tuple_buffer;
	HeapTuple	tup = NULL;

	/*
	 * If we're being asked to generate a tuple from the leader, then we just
	 * call ExecProcNode as normal to produce one.
	 */
	if (gm_state->nreaders == reader)
	{
		if (gm_state->need_to_scan_locally)
		{
			PlanState  *outerPlan = outerPlanState(gm_state);
			TupleTableSlot *outerTupleSlot;

			outerTupleSlot = ExecProcNode(outerPlan);

			if (!TupIsNull(outerTupleSlot))
			{
				gm_state->gm_slots[reader] = outerTupleSlot;
				return true;
			}
			gm_state->gm_tuple_buffers[reader].done = true;
			gm_state->need_to_scan_locally = false;
		}
		return false;
	}

	/* Otherwise, check the state of the relevant tuple buffer. */
	tuple_buffer = &gm_state->gm_tuple_buffers[reader];

	if (tuple_buffer->nTuples > tuple_buffer->readCounter)
	{
		/* Return any tuple previously read that is still buffered. */
		tuple_buffer = &gm_state->gm_tuple_buffers[reader];
		tup = tuple_buffer->tuple[tuple_buffer->readCounter++];
	}
	else if (tuple_buffer->done)
	{
		/* Reader is known to be exhausted. */
		DestroyTupleQueueReader(gm_state->reader[reader]);
		gm_state->reader[reader] = NULL;
		return false;
	}
	else
	{
		/* Read and buffer next tuple. */
		tup = heap_copytuple(gm_readnext_tuple(gm_state,
											   reader,
											   nowait,
											   &tuple_buffer->done));

		/*
		 * Attempt to read more tuples in nowait mode and store them in the
		 * tuple array.
		 */
		if (HeapTupleIsValid(tup))
			form_tuple_array(gm_state, reader);
		else
			return false;
	}

	Assert(HeapTupleIsValid(tup));

	/* Build the TupleTableSlot for the given tuple */
	ExecStoreTuple(tup,			/* tuple to store */
				   gm_state->gm_slots[reader],	/* slot in which to store the
												 * tuple */
				   InvalidBuffer,		/* buffer associated with this tuple */
				   true);		/* pfree this pointer if not from heap */

	return true;
}

/*
 * Attempt to read a tuple from given reader.
 */
static HeapTuple
gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait,
				  bool *done)
{
	TupleQueueReader *reader;
	HeapTuple	tup = NULL;
	MemoryContext oldContext;
	MemoryContext tupleContext;

	tupleContext = gm_state->ps.ps_ExprContext->ecxt_per_tuple_memory;

	if (done != NULL)
		*done = false;

	/* Check for async events, particularly messages from workers. */
	CHECK_FOR_INTERRUPTS();

	/* Attempt to read a tuple. */
	reader = gm_state->reader[nreader];

	/* Run TupleQueueReaders in per-tuple context */
	oldContext = MemoryContextSwitchTo(tupleContext);
	tup = TupleQueueReaderNext(reader, nowait, done);
	MemoryContextSwitchTo(oldContext);

	return tup;
}

/*
 * We have one slot for each item in the heap array.  We use SlotNumber
 * to store slot indexes.  This doesn't actually provide any formal
 * type-safety, but it makes the code more self-documenting.
 */
typedef int32 SlotNumber;

/*
 * Compare the tuples in the two given slots.
 */
static int32
heap_compare_slots(Datum a, Datum b, void *arg)
{
	GatherMergeState *node = (GatherMergeState *) arg;
	SlotNumber	slot1 = DatumGetInt32(a);
	SlotNumber	slot2 = DatumGetInt32(b);

	TupleTableSlot *s1 = node->gm_slots[slot1];
	TupleTableSlot *s2 = node->gm_slots[slot2];
	int			nkey;

	Assert(!TupIsNull(s1));
	Assert(!TupIsNull(s2));

	for (nkey = 0; nkey < node->gm_nkeys; nkey++)
	{
		SortSupport sortKey = node->gm_sortkeys + nkey;
		AttrNumber	attno = sortKey->ssup_attno;
		Datum		datum1,
					datum2;
		bool		isNull1,
					isNull2;
		int			compare;

		datum1 = slot_getattr(s1, attno, &isNull1);
		datum2 = slot_getattr(s2, attno, &isNull2);

		compare = ApplySortComparator(datum1, isNull1,
									  datum2, isNull2,
									  sortKey);
		if (compare != 0)
			return -compare;
	}
	return 0;
}