Improve tuplesort.c to support variable merge order. The original coding with fixed merge order (fixed number of "tapes") was based on obsolete assumptions, namely that tape drives are expensive. Since our "tapes" are really just a couple of buffers, we can have a lot of them given adequate workspace. This allows reduction of the number of merge passes with consequent savings of I/O during large sorts. Simon Riggs with some rework by Tom Lane

Improve tuplesort.c to support variable merge order. The original coding with fixed merge order (fixed number of "tapes") was based on obsolete assumptions, namely that tape drives are expensive. Since our "tapes" are really just a couple of buffers, we can have a lot of them given adequate workspace. This allows reduction of the number of merge passes with consequent savings of I/O during large sorts. Simon Riggs with some rework by Tom Lane
df700e6b · Tom Lane · 85c0eac1 · df700e6b · df700e6b · df700e6b
Commit df700e6b authored Feb 19, 2006 by Tom Lane
3 changed files
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -49,7 +49,7 @@
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.153 2006/02/05 02:59:16 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.154 2006/02/19 05:54:06 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -70,10 +70,10 @@
 #include "utils/selfuncs.h"
 #include "utils/lsyscache.h"
 #include "utils/syscache.h"
+#include "utils/tuplesort.h"
 #define LOG2(x)  (log(x) / 0.693147180559945)
-#define LOG6(x)  (log(x) / 1.79175946922805)
 /*
 * Some Paths return less than the nominal number of rows of their parent
@@ -767,11 +767,10 @@ cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
 * If the total volume exceeds work_mem, we switch to a tape-style merge
 * algorithm.  There will still be about t*log2(t) tuple comparisons in
 * total, but we will also need to write and read each tuple once per
- * merge pass.	We expect about ceil(log6(r)) merge passes where r is the
+ * merge pass.  We expect about ceil(logM(r)) merge passes where r is the
- * number of initial runs formed (log6 because tuplesort.c uses six-tape
+ * number of initial runs formed and M is the merge order used by tuplesort.c.
- * merging).  Since the average initial run should be about twice work_mem,
+ * Since the average initial run should be about twice work_mem, we have
- * we have
+ *		disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem)))
- *		disk traffic = 2 * relsize * ceil(log6(p / (2*work_mem)))
 *		cpu = comparison_cost * t * log2(t)
 *
 * The disk traffic is assumed to be half sequential and half random
@@ -824,10 +823,14 @@ cost_sort(Path *path, PlannerInfo *root,
 	{
 		double		npages = ceil(nbytes / BLCKSZ);
 		double		nruns = (nbytes / work_mem_bytes) * 0.5;
-		double		log_runs = ceil(LOG6(nruns));
+		double		mergeorder = tuplesort_merge_order(work_mem_bytes);
+		double		log_runs;
 		double		npageaccesses;
-		if (log_runs < 1.0)
+		/* Compute logM(r) as log(r) / log(M) */
+		if (nruns > mergeorder)
+			log_runs = ceil(log(nruns) / log(mergeorder));
+		else
 			log_runs = 1.0;
 		npageaccesses = 2.0 * npages * log_runs;
 		/* Assume half are sequential (cost 1), half are not */

--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -13,7 +13,7 @@
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.17 2004/12/31 22:03:46 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.18 2006/02/19 05:54:06 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -67,6 +67,8 @@ extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
 extern void tuplesort_end(Tuplesortstate *state);
+extern int	tuplesort_merge_order(long allowedMem);
 /*
 * These routines may only be called if randomAccess was specified 'true'.
 * Likewise, backwards scan in gettuple/getdatum is only allowed if