Use quicksort, not replacement selection, for external sorting.

We still use replacement selection, but only for the first run of the sort,
and only when the number of tuples is relatively small.  Otherwise,
the first run, and subsequent runs in all cases, are produced using
quicksort.  This tends to be faster except perhaps for very small
amounts of working memory.

Peter Geoghegan, reviewed by Tomas Vondra, Jeff Janes, Mithun Cy,
Greg Stark, and me.
Robert Haas 2016-04-08 02:36:26 -04:00
parent 719c84c1be
commit 0711803775
7 changed files with 431 additions and 92 deletions

doc/src/sgml/config.sgml

@ -1472,6 +1472,45 @@ include_dir 'conf.d'
</listitem>
</varlistentry>
<varlistentry id="guc-replacement-sort-tuples" xreflabel="replacement_sort_tuples">
<term><varname>replacement_sort_tuples</varname> (<type>integer</type>)
<indexterm>
<primary><varname>replacement_sort_tuples</> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
When the number of tuples to be sorted is smaller than this number,
a sort will produce its first output run using replacement selection
rather than quicksort. This may be useful in memory-constrained
environments where tuples that are input into larger sort operations
have a strong physical-to-logical correlation. Note that this does
not include input tuples with an <emphasis>inverse</emphasis>
correlation. It is possible for the replacement selection algorithm
to generate one long run that requires no merging, where use of the
default strategy would result in many runs that must be merged
to produce a final sorted output. This may allow sort
operations to complete sooner.
</para>
<para>
The default is 150,000 tuples. Note that higher values are typically
not much more effective, and may be counter-productive, since the
priority queue is sensitive to the size of available CPU cache, whereas
the default strategy sorts runs using a <firstterm>cache
oblivious</firstterm> algorithm. This property allows the default sort
strategy to automatically and transparently make effective use
of available CPU cache.
</para>
<para>
Setting <varname>maintenance_work_mem</varname> to its default
value usually prevents utility command external sorts (e.g.,
sorts used by <command>CREATE INDEX</> to build B-Tree
indexes) from ever using replacement selection sort, unless the
input tuples are quite wide.
</para>
</listitem>
</varlistentry>
<varlistentry id="guc-autovacuum-work-mem" xreflabel="autovacuum_work_mem">
<term><varname>autovacuum_work_mem</varname> (<type>integer</type>)
<indexterm>

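As a rough standalone illustration of the last paragraph above (not part of the patch; the figures are simply the defaults of maintenance_work_mem and replacement_sort_tuples), the break-even tuple footprint below is the average in-memory tuple size above which fewer than 150,000 tuples fit at once, which is roughly what it takes for replacement selection to remain eligible at default settings:

#include <stdio.h>

int
main(void)
{
	long	mem_bytes = 64L * 1024 * 1024;	/* assumed: default maintenance_work_mem (64MB) */
	long	tuple_limit = 150000;			/* assumed: default replacement_sort_tuples */

	/* average per-tuple footprint above which fewer than tuple_limit tuples fit */
	printf("break-even footprint: ~%ld bytes per tuple\n", mem_bytes / tuple_limit);
	return 0;
}

In other words, unless tuples (plus per-tuple overhead) average several hundred bytes, a default-sized maintenance_work_mem sort will already hold more than 150,000 tuples when it first spills, and quicksort will be used for every run.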
src/backend/optimizer/path/costsize.c

@ -1432,8 +1432,8 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
* total, but we will also need to write and read each tuple once per
* merge pass. We expect about ceil(logM(r)) merge passes where r is the
* number of initial runs formed and M is the merge order used by tuplesort.c.
* Since the average initial run should be about twice sort_mem, we have
* disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem)))
* Since the average initial run should be about sort_mem, we have
* disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
* cpu = comparison_cost * t * log2(t)
*
* If the sort is bounded (i.e., only the first k result tuples are needed)
@ -1509,7 +1509,7 @@ cost_sort(Path *path, PlannerInfo *root,
* We'll have to use a disk-based sort of all the tuples
*/
double npages = ceil(input_bytes / BLCKSZ);
double nruns = (input_bytes / sort_mem_bytes) * 0.5;
double nruns = input_bytes / sort_mem_bytes;
double mergeorder = tuplesort_merge_order(sort_mem_bytes);
double log_runs;
double npageaccesses;

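An illustrative standalone calculation (assumed numbers, not code from the patch) of what the revised run-count estimate does to the expected merge passes: the old estimate assumed runs about twice sort_mem, the new one assumes runs about sort_mem, and a real plan would get the merge order from tuplesort_merge_order() rather than the assumed constant below.

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double	input_bytes = 10.0 * 1024 * 1024 * 1024;	/* assumed: 10GB of input */
	double	sort_mem_bytes = 64.0 * 1024 * 1024;		/* assumed: 64MB of sort memory */
	double	mergeorder = 100.0;							/* assumed merge order */
	double	nruns_old = (input_bytes / sort_mem_bytes) * 0.5;	/* pre-patch estimate */
	double	nruns_new = input_bytes / sort_mem_bytes;			/* post-patch estimate */

	printf("old: %.0f runs, %.0f merge pass(es)\n",
		   nruns_old, ceil(log(nruns_old) / log(mergeorder)));
	printf("new: %.0f runs, %.0f merge pass(es)\n",
		   nruns_new, ceil(log(nruns_new) / log(mergeorder)));
	return 0;
}

With these particular numbers the doubled run count happens to cross a power of the merge order (80 runs merge in one pass, 160 need two); for many workloads ceil(logM(nruns)) is unchanged and only the estimated number of runs differs.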
src/backend/utils/init/globals.c

@ -109,6 +109,7 @@ bool enableFsync = true;
bool allowSystemTableMods = false;
int work_mem = 1024;
int maintenance_work_mem = 16384;
int replacement_sort_tuples = 150000;
/*
* Primary determinants of sizes of shared-memory structures.

src/backend/utils/misc/guc.c

@ -1928,6 +1928,16 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
{
{"replacement_sort_tuples", PGC_USERSET, RESOURCES_MEM,
gettext_noop("Sets the maximum number of tuples to be sorted using replacement selection."),
gettext_noop("When more tuples than this are present, quicksort will be used.")
},
&replacement_sort_tuples,
150000, 0, INT_MAX,
NULL, NULL, NULL
},
/*
* We use the hopefully-safely-small value of 100kB as the compiled-in
* default for max_stack_depth. InitializeGUCOptions will increase it if

src/backend/utils/misc/postgresql.conf.sample

@ -125,6 +125,7 @@
# actively intend to use prepared transactions.
#work_mem = 4MB # min 64kB
#maintenance_work_mem = 64MB # min 1MB
#replacement_sort_tuples = 150000 # limits use of replacement selection sort
#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem
#max_stack_depth = 2MB # min 100kB
#dynamic_shared_memory_type = posix # the default is the first option

src/backend/utils/sort/tuplesort.c

@ -11,14 +11,16 @@
* algorithm.
*
* See Knuth, volume 3, for more than you want to know about the external
* sorting algorithm. We divide the input into sorted runs using replacement
* selection, in the form of a priority tree implemented as a heap
* (essentially his Algorithm 5.2.3H), then merge the runs using polyphase
* merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D
* are implemented by logtape.c, which avoids space wastage by recycling
* disk space as soon as each block is read from its "tape".
* sorting algorithm. Historically, we divided the input into sorted runs
* using replacement selection, in the form of a priority tree implemented
* as a heap (essentially his Algorithm 5.2.3H -- although that strategy is
* often avoided altogether), but that can now only happen for the first
* run. We merge the runs using polyphase merge, Knuth's Algorithm
* 5.4.2D. The logical "tapes" used by Algorithm D are implemented by
* logtape.c, which avoids space wastage by recycling disk space as soon
* as each block is read from its "tape".
*
* We do not form the initial runs using Knuth's recommended replacement
* We never form the initial runs using Knuth's recommended replacement
* selection data structure (Algorithm 5.4.1R), because it uses a fixed
* number of records in memory at all times. Since we are dealing with
* tuples that may vary considerably in size, we want to be able to vary
@ -28,7 +30,30 @@
* Algorithm 5.4.1R, each record is stored with the run number that it
* must go into, and we use (run number, key) as the ordering key for the
* heap. When the run number at the top of the heap changes, we know that
* no more records of the prior run are left in the heap.
* no more records of the prior run are left in the heap. Note that there
* are in practice only ever two distinct run numbers, due to the greatly
* reduced use of replacement selection in PostgreSQL 9.6.
*
* In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small
* customizations) is only used with the aim of producing just one run,
* thereby avoiding all merging. Only the first run can use replacement
* selection, which is why there are now only two possible valid run
* numbers, and why heapification is customized to not distinguish between
* tuples in the second run (those will be quicksorted). We generally
* prefer a simple hybrid sort-merge strategy, where runs are sorted in much
* the same way as the entire input of an internal sort is sorted (using
* qsort()). The replacement_sort_tuples GUC controls the limited remaining
* use of replacement selection for the first run.
*
* There are several reasons to favor a hybrid sort-merge strategy.
* Maintaining a priority tree/heap has poor CPU cache characteristics.
* Furthermore, the growth in main memory sizes has greatly diminished the
* value of having runs that are larger than available memory, even in the
* case where there is partially sorted input and runs can be made far
* larger by using a heap. In most cases, a single-pass merge step is all
* that is required even when runs are no larger than available memory.
* Avoiding multiple merge passes was traditionally considered to be the
* major advantage of using replacement selection.
*
* The approximate amount of memory allowed for any one sort operation
* is specified in kilobytes by the caller (most pass work_mem). Initially,
@ -36,13 +61,12 @@
* we haven't exceeded workMem. If we reach the end of the input without
* exceeding workMem, we sort the array using qsort() and subsequently return
* tuples just by scanning the tuple array sequentially. If we do exceed
* workMem, we construct a heap using Algorithm H and begin to emit tuples
* into sorted runs in temporary tapes, emitting just enough tuples at each
* step to get back within the workMem limit. Whenever the run number at
* the top of the heap changes, we begin a new run with a new output tape
* (selected per Algorithm D). After the end of the input is reached,
* we dump out remaining tuples in memory into a final run (or two),
* then merge the runs using Algorithm D.
* workMem, we begin to emit tuples into sorted runs in temporary tapes.
* When tuples are dumped in batch after quicksorting, we begin a new run
* with a new output tape (selected per Algorithm D). After the end of the
* input is reached, we dump out remaining tuples in memory into a final run
* (or two, when replacement selection is still used), then merge the runs
* using Algorithm D.
*
* When merging runs, we use a heap containing just the frontmost tuple from
* each source run; we repeatedly output the smallest tuple and insert the
@ -162,15 +186,18 @@ bool optimize_bounded_sort = true;
* described above. Accordingly, "tuple" is always used in preference to
* datum1 as the authoritative value for pass-by-reference cases.
*
* While building initial runs, tupindex holds the tuple's run number. During
* merge passes, we re-use it to hold the input tape number that each tuple in
* the heap was read from, or to hold the index of the next tuple pre-read
* from the same tape in the case of pre-read entries. tupindex goes unused
* if the sort occurs entirely in memory.
* While building initial runs, tupindex holds the tuple's run number.
* Historically, the run number could meaningfully distinguish many runs, but
* it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement
* selection is always abandoned after the first run; no other run number
* should be represented here. During merge passes, we re-use it to hold the
* input tape number that each tuple in the heap was read from, or to hold the
* index of the next tuple pre-read from the same tape in the case of pre-read
* entries. tupindex goes unused if the sort occurs entirely in memory.
*/
typedef struct
{
void *tuple; /* the tuple proper */
void *tuple; /* the tuple itself */
Datum datum1; /* value of first key column */
bool isnull1; /* is first key column NULL? */
int tupindex; /* see notes above */
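The run-building and merging behavior described in the comments above can be sketched with a self-contained toy (assumed sizes, plain ints standing in for SortTuples; this is not tuplesort.c code): batches are quicksorted into fixed-capacity runs, then merged in a single pass, which is the common case for the hybrid sort-merge strategy.

#include <stdio.h>
#include <stdlib.h>

#define NTUPLES		 1000	/* assumed input size */
#define RUN_CAPACITY 256	/* stand-in for the workMem limit */
#define MAXRUNS		 (NTUPLES / RUN_CAPACITY + 1)

static int
cmp_int(const void *a, const void *b)
{
	int		x = *(const int *) a;
	int		y = *(const int *) b;

	return (x > y) - (x < y);
}

int
main(void)
{
	static int	runs[MAXRUNS][RUN_CAPACITY];
	int			runlen[MAXRUNS] = {0};
	int			pos[MAXRUNS] = {0};
	int			nruns = 0;
	int			prev = -1;

	/* run building: quicksort each full batch, then start a new run */
	for (int i = 0; i < NTUPLES; i++)
	{
		if (runlen[nruns] == RUN_CAPACITY)
			qsort(runs[nruns++], RUN_CAPACITY, sizeof(int), cmp_int);
		runs[nruns][runlen[nruns]++] = rand() % 10000;
	}
	qsort(runs[nruns], runlen[nruns], sizeof(int), cmp_int);
	nruns++;

	/* single merge pass: repeatedly emit the smallest frontmost value */
	for (int emitted = 0; emitted < NTUPLES; emitted++)
	{
		int			best = -1;

		for (int r = 0; r < nruns; r++)
			if (pos[r] < runlen[r] &&
				(best < 0 || runs[r][pos[r]] < runs[best][pos[best]]))
				best = r;
		if (runs[best][pos[best]] < prev)
			return 1;			/* out of order: would indicate a merge bug */
		prev = runs[best][pos[best]++];
	}
	printf("%d runs, merged in one pass\n", nruns);
	return 0;
}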
@ -206,6 +233,15 @@ typedef enum
#define TAPE_BUFFER_OVERHEAD (BLCKSZ * 3)
#define MERGE_BUFFER_SIZE (BLCKSZ * 32)
/*
* Run numbers, used during external sort operations.
*
* HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun.
*/
#define RUN_FIRST 0
#define HEAP_RUN_NEXT INT_MAX
#define RUN_SECOND 1
typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
@ -292,9 +328,17 @@ struct Tuplesortstate
*/
bool batchUsed;
/*
* While building initial runs, this indicates if the replacement
* selection strategy is in use. When it isn't, then a simple hybrid
* sort-merge strategy is in use instead (runs are quicksorted).
*/
bool replaceActive;
/*
* While building initial runs, this is the current output run number
* (starting at 0). Afterwards, it is the number of initial runs we made.
* (starting at RUN_FIRST). Afterwards, it is the number of initial
* runs we made.
*/
int currentRun;
@ -493,6 +537,7 @@ struct Tuplesortstate
static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
static bool consider_abort_common(Tuplesortstate *state);
static bool useselection(Tuplesortstate *state);
static void inittapes(Tuplesortstate *state);
static void selectnewtape(Tuplesortstate *state);
static void mergeruns(Tuplesortstate *state);
@ -508,8 +553,10 @@ static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen);
static void mergepreread(Tuplesortstate *state);
static void mergeprereadone(Tuplesortstate *state, int srcTape);
static void dumptuples(Tuplesortstate *state, bool alltuples);
static void dumpbatch(Tuplesortstate *state, bool alltuples);
static void make_bounded_heap(Tuplesortstate *state);
static void sort_bounded_heap(Tuplesortstate *state);
static void tuplesort_sort_memtuples(Tuplesortstate *state);
static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
int tupleindex, bool checkIndex);
static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
@ -654,7 +701,7 @@ tuplesort_begin_common(int workMem, bool randomAccess)
if (LACKMEM(state))
elog(ERROR, "insufficient memory allowed for sort");
state->currentRun = 0;
state->currentRun = RUN_FIRST;
/*
* maxTapes, tapeRange, and Algorithm D variables will be initialized by
@ -1566,22 +1613,61 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple)
/*
* Insert the tuple into the heap, with run number currentRun if
* it can go into the current run, else run number currentRun+1.
* The tuple can go into the current run if it is >= the first
* it can go into the current run, else HEAP_RUN_NEXT. The tuple
* can go into the current run if it is >= the first
* not-yet-output tuple. (Actually, it could go into the current
* run if it is >= the most recently output tuple ... but that
* would require keeping around the tuple we last output, and it's
* simplest to let writetup free each tuple as soon as it's
* written.)
*
* Note there will always be at least one tuple in the heap at
* this point; see dumptuples.
* Note that this only applies when:
*
* - currentRun is RUN_FIRST
*
* - Replacement selection is in use (typically it is never used).
*
* When these two conditions are not both true, all tuples are
* appended indifferently, much like the TSS_INITIAL case.
*
* There should always be room to store the incoming tuple.
*/
Assert(state->memtupcount > 0);
if (COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
Assert(!state->replaceActive || state->memtupcount > 0);
if (state->replaceActive &&
COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
{
Assert(state->currentRun == RUN_FIRST);
/*
* Insert tuple into first, fully heapified run.
*
* Unlike classic replacement selection, which this module was
* previously based on, only RUN_FIRST tuples are fully
* heapified. Any second/next run tuples are appended
* indifferently. While HEAP_RUN_NEXT tuples may be sifted
* out of the way of first run tuples, COMPARETUP() will never
* be called for the run's tuples during sifting (only our
* initial COMPARETUP() call is required for the tuple, to
* determine that the tuple does not belong in RUN_FIRST).
*/
tuplesort_heap_insert(state, tuple, state->currentRun, true);
}
else
tuplesort_heap_insert(state, tuple, state->currentRun + 1, true);
{
/*
* Tuple was determined to not belong to heapified RUN_FIRST,
* or replacement selection not in play. Append the tuple to
* memtuples indifferently.
*
* dumptuples() does not trust that the next run's tuples are
* heapified. Anything past the first run will always be
* quicksorted even when replacement selection is initially
* used. (When it's never used, every tuple still takes this
* path.)
*/
tuple->tupindex = HEAP_RUN_NEXT;
state->memtuples[state->memtupcount++] = *tuple;
}
/*
* If we are over the memory limit, dump tuples till we're under.
@ -1658,18 +1744,7 @@ tuplesort_performsort(Tuplesortstate *state)
* We were able to accumulate all the tuples within the allowed
* amount of memory. Just qsort 'em and we're done.
*/
if (state->memtupcount > 1)
{
/* Can we use the single-key sort function? */
if (state->onlyKey != NULL)
qsort_ssup(state->memtuples, state->memtupcount,
state->onlyKey);
else
qsort_tuple(state->memtuples,
state->memtupcount,
state->comparetup,
state);
}
tuplesort_sort_memtuples(state);
state->current = 0;
state->eof_reached = false;
state->markpos_offset = 0;
@ -2181,6 +2256,28 @@ tuplesort_merge_order(int64 allowedMem)
return mOrder;
}
/*
* useselection - determine algorithm to use to sort first run.
*
* It can sometimes be useful to use the replacement selection algorithm if it
* results in one large run, and there is little available workMem. See
* remarks on RUN_SECOND optimization within dumptuples().
*/
static bool
useselection(Tuplesortstate *state)
{
/*
* memtupsize might be noticeably higher than memtupcount here in atypical
* cases. It seems slightly preferable to not allow recent outliers to
* impact this determination. Note that caller's trace_sort output reports
* memtupcount instead.
*/
if (state->memtupsize <= replacement_sort_tuples)
return true;
return false;
}
/*
* inittapes - initialize for tape sorting.
*
@ -2190,7 +2287,6 @@ static void
inittapes(Tuplesortstate *state)
{
int maxTapes,
ntuples,
j;
int64 tapeSpace;
@ -2253,25 +2349,42 @@ inittapes(Tuplesortstate *state)
state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
/*
* Convert the unsorted contents of memtuples[] into a heap. Each tuple is
* marked as belonging to run number zero.
*
* NOTE: we pass false for checkIndex since there's no point in comparing
* indexes in this step, even though we do intend the indexes to be part
* of the sort key...
* Give replacement selection a try based on user setting. There will
* be a switch to a simple hybrid sort-merge strategy after the first
* run (iff we could not output one long run).
*/
ntuples = state->memtupcount;
state->memtupcount = 0; /* make the heap empty */
for (j = 0; j < ntuples; j++)
state->replaceActive = useselection(state);
if (state->replaceActive)
{
/* Must copy source tuple to avoid possible overwrite */
SortTuple stup = state->memtuples[j];
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "replacement selection will sort %d first run tuples",
state->memtupcount);
#endif
/*
* Convert the unsorted contents of memtuples[] into a heap. Each
* tuple is marked as belonging to run number zero.
*
* NOTE: we pass false for checkIndex since there's no point in
* comparing indexes in this step, even though we do intend the
* indexes to be part of the sort key...
*/
int ntuples = state->memtupcount;
tuplesort_heap_insert(state, &stup, 0, false);
state->memtupcount = 0; /* make the heap empty */
for (j = 0; j < ntuples; j++)
{
/* Must copy source tuple to avoid possible overwrite */
SortTuple stup = state->memtuples[j];
tuplesort_heap_insert(state, &stup, 0, false);
}
Assert(state->memtupcount == ntuples);
}
Assert(state->memtupcount == ntuples);
state->currentRun = 0;
state->currentRun = RUN_FIRST;
/*
* Initialize variables of Algorithm D (step D1).
@ -2362,11 +2475,12 @@ mergeruns(Tuplesortstate *state)
/*
* If we produced only one initial run (quite likely if the total data
* volume is between 1X and 2X workMem), we can just use that tape as the
* finished output, rather than doing a useless merge. (This obvious
* optimization is not in Knuth's algorithm.)
* volume is between 1X and 2X workMem when replacement selection is used,
* but something we particularly count on when input is presorted), we can
* just use that tape as the finished output, rather than doing a useless
* merge. (This obvious optimization is not in Knuth's algorithm.)
*/
if (state->currentRun == 1)
if (state->currentRun == RUN_SECOND)
{
state->result_tape = state->tp_tapenum[state->destTape];
/* must freeze and rewind the finished output tape */
@ -3094,21 +3208,25 @@ mergeprereadone(Tuplesortstate *state, int srcTape)
}
/*
* dumptuples - remove tuples from heap and write to tape
* dumptuples - remove tuples from memtuples and write to tape
*
* This is used during initial-run building, but not during merging.
*
* When alltuples = false, dump only enough tuples to get under the
* availMem limit (and leave at least one tuple in the heap in any case,
* since puttuple assumes it always has a tuple to compare to). We also
* insist there be at least one free slot in the memtuples[] array.
* When alltuples = false and replacement selection is still active, dump
* only enough tuples to get under the availMem limit (and leave at least
* one tuple in memtuples, since puttuple will then assume it is a heap that
* has a tuple to compare to). We always insist there be at least one free
* slot in the memtuples[] array.
*
* When alltuples = true, dump everything currently in memory.
* (This case is only used at end of input data.)
* When alltuples = true, dump everything currently in memory. (This
* case is only used at end of input data, although in practice only the
* first run could fail to dump all tuples when we LACKMEM(), and only
* when replacement selection is active.)
*
* If we empty the heap, close out the current run and return (this should
* only happen at end of input data). If we see that the tuple run number
* at the top of the heap has changed, start a new run.
* If, when replacement selection is active, we see that the tuple run
* number at the top of the heap has changed, start a new run. This must be
* the first run, because replacement selection is always abandoned for all
* further runs.
*/
static void
dumptuples(Tuplesortstate *state, bool alltuples)
@ -3117,46 +3235,183 @@ dumptuples(Tuplesortstate *state, bool alltuples)
(LACKMEM(state) && state->memtupcount > 1) ||
state->memtupcount >= state->memtupsize)
{
/*
* Dump the heap's frontmost entry, and sift up to remove it from the
* heap.
*/
Assert(state->memtupcount > 0);
WRITETUP(state, state->tp_tapenum[state->destTape],
&state->memtuples[0]);
tuplesort_heap_siftup(state, true);
if (state->replaceActive)
{
/*
* Still holding out for a case favorable to replacement selection.
* Still incrementally spilling using heap.
*
* Dump the heap's frontmost entry, and sift up to remove it from
* the heap.
*/
Assert(state->memtupcount > 0);
WRITETUP(state, state->tp_tapenum[state->destTape],
&state->memtuples[0]);
tuplesort_heap_siftup(state, true);
}
else
{
/*
* Once committed to quicksorting runs, never incrementally
* spill
*/
dumpbatch(state, alltuples);
break;
}
/*
* If the heap is empty *or* top run number has changed, we've
* finished the current run.
* If top run number has changed, we've finished the current run
* (this can only be the first run), and will no longer spill
* incrementally.
*/
if (state->memtupcount == 0 ||
state->currentRun != state->memtuples[0].tupindex)
state->memtuples[0].tupindex == HEAP_RUN_NEXT)
{
markrunend(state, state->tp_tapenum[state->destTape]);
Assert(state->currentRun == RUN_FIRST);
state->currentRun++;
state->tp_runs[state->destTape]++;
state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "finished writing%s run %d to tape %d: %s",
(state->memtupcount == 0) ? " final" : "",
elog(LOG, "finished incrementally writing %s run %d to tape %d: %s",
(state->memtupcount == 0) ? "only" : "first",
state->currentRun, state->destTape,
pg_rusage_show(&state->ru_start));
#endif
/*
* Done if heap is empty, which is possible when there is only one
* long run.
*/
Assert(state->currentRun == RUN_SECOND);
if (state->memtupcount == 0)
{
/*
* Replacement selection best case; no final merge required,
* because there was only one initial run (second run has no
* tuples). See RUN_SECOND case in mergeruns().
*/
break;
}
/*
* Done if heap is empty, else prepare for new run.
* Abandon replacement selection for second run (as well as any
* subsequent runs).
*/
if (state->memtupcount == 0)
break;
Assert(state->currentRun == state->memtuples[0].tupindex);
state->replaceActive = false;
/*
* First tuple of next run should not be heapified, and so will
* bear placeholder run number. In practice this must actually be
* the second run, which just became the currentRun, so we're
* clear to quicksort and dump the tuples in batch next time
* memtuples becomes full.
*/
Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT);
selectnewtape(state);
}
}
}
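The "only one long run" best case that the incremental-spill path above preserves can be shown with a standalone toy of replacement selection (assumed sizes; a linear scan over a small slot array stands in for the priority heap, and plain ints stand in for tuples; not tuplesort.c code). Fed input that already matches the output order, a buffer of just 100 slots emits all 10,000 values as a single run, leaving nothing for mergeruns() to merge; with random input, runs would instead average only about twice the buffer size.

#include <stdio.h>

#define NTUPLES	 10000			/* assumed input size, fed in ascending order */
#define CAPACITY 100			/* stand-in for the in-memory heap's capacity */

int
main(void)
{
	int			slot[CAPACITY];
	int			slotrun[CAPACITY];	/* run each buffered value is destined for */
	int			currentrun = 0;
	int			runlength = 0;
	int			next = CAPACITY;

	/* prime the buffer with the first CAPACITY inputs */
	for (int i = 0; i < CAPACITY; i++)
	{
		slot[i] = i;
		slotrun[i] = 0;
	}

	for (;;)
	{
		int			best = -1;

		/* select the smallest buffered value belonging to the current run */
		for (int i = 0; i < CAPACITY; i++)
			if (slotrun[i] == currentrun &&
				(best < 0 || slot[i] < slot[best]))
				best = i;

		if (best < 0)
		{
			/* current run complete; stop if nothing is held for a later run */
			int			pending = 0;

			printf("run %d: %d tuples\n", currentrun, runlength);
			for (int i = 0; i < CAPACITY; i++)
				if (slotrun[i] > currentrun)
					pending = 1;
			if (!pending)
				break;
			currentrun++;
			runlength = 0;
			continue;
		}

		/* "write" the selected value, then refill its slot from the input */
		runlength++;
		if (next < NTUPLES)
		{
			/* values smaller than what was just written wait for the next run */
			slotrun[best] = (next >= slot[best]) ? currentrun : currentrun + 1;
			slot[best] = next++;
		}
		else
			slotrun[best] = -1;	/* input exhausted; retire the slot */
	}
	return 0;
}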
/*
* dumpbatch - sort and dump all memtuples, forming one run on tape
*
* Second or subsequent runs are never heapified by this module (although
* heapification still respects run number differences between the first and
* second runs), and a heap (replacement selection priority queue) is often
* avoided in the first place.
*/
static void
dumpbatch(Tuplesortstate *state, bool alltuples)
{
int memtupwrite;
int i;
/*
* Final call might require no sorting, in rare cases where we just so
* happen to have previously LACKMEM()'d at the point where exactly all
* remaining tuples are loaded into memory, just before input was
* exhausted.
*
* In general, short final runs are quite possible. Rather than
* allowing a special case where there was a superfluous
* selectnewtape() call (i.e. a call with no subsequent run actually
* written to destTape), we prefer to write out a 0 tuple run.
*
* mergepreread()/mergeprereadone() are prepared for 0 tuple runs, and
* will reliably mark the tape inactive for the merge when called from
* beginmerge(). This case is therefore similar to the case where
* mergeonerun() finds a dummy run for the tape, and so doesn't need to
* merge a run from the tape (or conceptually "merges" the dummy run,
* if you prefer). According to Knuth, Algorithm D "isn't strictly
* optimal" in its method of distribution and dummy run assignment;
* this edge case seems very unlikely to make that appreciably worse.
*/
Assert(state->status == TSS_BUILDRUNS);
/*
* It seems unlikely that this limit will ever be exceeded, but take no
* chances
*/
if (state->currentRun == INT_MAX)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("cannot have more than %d runs for an external sort",
INT_MAX)));
state->currentRun++;
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "starting quicksort of run %d: %s",
state->currentRun, pg_rusage_show(&state->ru_start));
#endif
/*
* Sort all tuples accumulated within the allowed amount of memory for this
* run using quicksort
*/
tuplesort_sort_memtuples(state);
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "finished quicksort of run %d: %s",
state->currentRun, pg_rusage_show(&state->ru_start));
#endif
memtupwrite = state->memtupcount;
for (i = 0; i < memtupwrite; i++)
{
WRITETUP(state, state->tp_tapenum[state->destTape],
&state->memtuples[i]);
state->memtupcount--;
}
/*
* Reset tuple memory. We've freed all of the tuples that we previously
* allocated. It's important to avoid fragmentation when there is a stark
* change in allocation patterns due to the use of batch memory.
* Fragmentation due to AllocSetFree's bucketing by size class might be
* particularly bad if this step wasn't taken.
*/
MemoryContextReset(state->tuplecontext);
markrunend(state, state->tp_tapenum[state->destTape]);
state->tp_runs[state->destTape]++;
state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "finished writing run %d to tape %d: %s",
state->currentRun, state->destTape,
pg_rusage_show(&state->ru_start));
#endif
if (!alltuples)
selectnewtape(state);
}
/*
* tuplesort_rescan - rewind and replay the scan
*/
@ -3315,10 +3570,15 @@ tuplesort_get_stats(Tuplesortstate *state,
*
* Compare two SortTuples. If checkIndex is true, use the tuple index
* as the front of the sort key; otherwise, no.
*
* Note that for checkIndex callers, the heap invariant is never
* maintained beyond the first run, and so there are no COMPARETUP()
* calls needed to distinguish tuples in HEAP_RUN_NEXT.
*/
#define HEAPCOMPARE(tup1,tup2) \
(checkIndex && ((tup1)->tupindex != (tup2)->tupindex) ? \
(checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \
(tup1)->tupindex == HEAP_RUN_NEXT) ? \
((tup1)->tupindex) - ((tup2)->tupindex) : \
COMPARETUP(state, tup1, tup2))
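The composite (run number, key) ordering that HEAPCOMPARE implements can be demonstrated standalone (toy struct and comparator, not the real SortTuple or macro): every HEAP_RUN_NEXT entry orders after every RUN_FIRST entry no matter its key, and two HEAP_RUN_NEXT entries compare as equal, so the key comparator is never consulted for them.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define RUN_FIRST		0
#define HEAP_RUN_NEXT	INT_MAX

typedef struct
{
	int			tupindex;		/* run number, as in SortTuple.tupindex */
	int			key;			/* toy sort key */
} toy_tuple;

static int
toy_heapcompare(const void *a, const void *b)
{
	const toy_tuple *t1 = a;
	const toy_tuple *t2 = b;

	/* run number first; HEAP_RUN_NEXT entries never reach the key comparison */
	if (t1->tupindex != t2->tupindex || t1->tupindex == HEAP_RUN_NEXT)
		return (t1->tupindex > t2->tupindex) - (t1->tupindex < t2->tupindex);
	return (t1->key > t2->key) - (t1->key < t2->key);
}

int
main(void)
{
	toy_tuple	tuples[] = {
		{HEAP_RUN_NEXT, 1},		/* small key, but held back for the next run */
		{RUN_FIRST, 42},
		{RUN_FIRST, 7},
		{HEAP_RUN_NEXT, 3},
	};

	qsort(tuples, 4, sizeof(toy_tuple), toy_heapcompare);
	for (int i = 0; i < 4; i++)
		printf("%s  key=%d\n",
			   tuples[i].tupindex == RUN_FIRST ? "RUN_FIRST    " : "HEAP_RUN_NEXT",
			   tuples[i].key);
	return 0;
}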
@ -3416,6 +3676,31 @@ sort_bounded_heap(Tuplesortstate *state)
state->boundUsed = true;
}
/*
* Sort all memtuples using specialized qsort() routines.
*
* Quicksort is used for small in-memory sorts. Quicksort is also generally
* preferred to replacement selection for generating runs during external sort
* operations, although replacement selection is sometimes used for the first
* run.
*/
static void
tuplesort_sort_memtuples(Tuplesortstate *state)
{
if (state->memtupcount > 1)
{
/* Can we use the single-key sort function? */
if (state->onlyKey != NULL)
qsort_ssup(state->memtuples, state->memtupcount,
state->onlyKey);
else
qsort_tuple(state->memtuples,
state->memtupcount,
state->comparetup,
state);
}
}
/*
* Insert a new tuple into an empty or existing heap, maintaining the
* heap invariant. Caller is responsible for ensuring there's room.
@ -3443,6 +3728,7 @@ tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
memtuples = state->memtuples;
Assert(state->memtupcount < state->memtupsize);
Assert(!checkIndex || tupleindex == RUN_FIRST);
CHECK_FOR_INTERRUPTS();
@ -3475,6 +3761,7 @@ tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex)
int i,
n;
Assert(!checkIndex || state->currentRun == RUN_FIRST);
if (--state->memtupcount <= 0)
return;

src/include/miscadmin.h

@ -239,6 +239,7 @@ extern bool enableFsync;
extern bool allowSystemTableMods;
extern PGDLLIMPORT int work_mem;
extern PGDLLIMPORT int maintenance_work_mem;
extern PGDLLIMPORT int replacement_sort_tuples;
extern int VacuumCostPageHit;
extern int VacuumCostPageMiss;