Fix parser so that we don't modify the user-written ORDER BY list in order

to represent DISTINCT or DISTINCT ON. This gets rid of a longstanding annoyance that a view or rule using SELECT DISTINCT will be dumped out with an overspecified ORDER BY list, and is one small step along the way to decoupling DISTINCT and ORDER BY enough so that hash-based implementation of DISTINCT will be possible. In passing, improve transformDistinctClause so that it doesn't reject duplicate DISTINCT ON items, as was reported by Steve Midgley a couple weeks ago.
2008-07-31 22:47:56 +00:00 · 2008-07-31 22:47:56 +00:00 · 63247bec28
parent b1fb3b2a7f
commit 63247bec28
7 changed files with 174 additions and 125 deletions
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@ -14,7 +14,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.106 2008/01/11 04:02:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.107 2008/07/31 22:47:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -288,7 +288,7 @@ query_planner(PlannerInfo *root, List *tlist,
 		 * levels of sort --- and, therefore, certainly need to read all the
 		 * tuples --- unless ORDER BY is a subset of GROUP BY.
 		 */
-		if (parse->groupClause && parse->sortClause &&
+		if (root->group_pathkeys && root->sort_pathkeys &&
 			!pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
 			tuple_fraction = 0.0;
 	}
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.234 2008/07/10 02:14:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.235 2008/07/31 22:47:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -826,6 +826,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		/*
 		 * Calculate pathkeys that represent result ordering requirements
 		 */
+		Assert(parse->distinctClause == NIL);
 		sort_pathkeys = make_pathkeys_for_sortclauses(root,
 													  parse->sortClause,
 													  tlist,
@ -864,17 +865,29 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 * Calculate pathkeys that represent grouping/ordering requirements.
 		 * Stash them in PlannerInfo so that query_planner can canonicalize
 		 * them after EquivalenceClasses have been formed.
+		 *
+		 * Note: for the moment, DISTINCT is always implemented via sort/uniq,
+		 * and we set the sort_pathkeys to be the more rigorous of the
+		 * DISTINCT and ORDER BY requirements.  This should be changed
+		 * someday, but DISTINCT ON is a bit of a problem ...
 		 */
 		root->group_pathkeys =
 			make_pathkeys_for_sortclauses(root,
 										  parse->groupClause,
 										  tlist,
 										  false);
-		root->sort_pathkeys =
-			make_pathkeys_for_sortclauses(root,
-										  parse->sortClause,
-										  tlist,
-										  false);
+		if (list_length(parse->distinctClause) > list_length(parse->sortClause))
+			root->sort_pathkeys =
+				make_pathkeys_for_sortclauses(root,
+											  parse->distinctClause,
+											  tlist,
+											  false);
+		else
+			root->sort_pathkeys =
+				make_pathkeys_for_sortclauses(root,
+											  parse->sortClause,
+											  tlist,
+											  false);

 		/*
 		 * Will need actual number of aggregates for estimating costs.
@ -903,9 +916,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 * by ORDER BY --- but that might just leave us failing to exploit an
 		 * available sort order at all. Needs more thought...)
 		 */
-		if (parse->groupClause)
+		if (root->group_pathkeys)
 			root->query_pathkeys = root->group_pathkeys;
-		else if (parse->sortClause)
+		else if (root->sort_pathkeys)
 			root->query_pathkeys = root->sort_pathkeys;
 		else
 			root->query_pathkeys = NIL;
@ -1172,7 +1185,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 	 * If we were not able to make the plan come out in the right order, add
 	 * an explicit sort step.
 	 */
-	if (parse->sortClause)
+	if (sort_pathkeys)
 	{
 		if (!pathkeys_contained_in(sort_pathkeys, current_pathkeys))
 		{
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@ -22,7 +22,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.147 2008/06/19 00:46:04 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/prep/prepunion.c,v 1.148 2008/07/31 22:47:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -69,6 +69,7 @@ static List *generate_setop_tlist(List *colTypes, int flag,
 static List *generate_append_tlist(List *colTypes, bool flag,
 					  List *input_plans,
 					  List *refnames_tlist);
+static List *generate_setop_sortlist(List *targetlist);
 static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte,
 						 Index rti);
 static void make_inh_translation_lists(Relation oldrelation,
@ -319,7 +320,7 @@ generate_union_plan(SetOperationStmt *op, PlannerInfo *root,
 	{
 		List	   *sortList;

-		sortList = addAllTargetsToSortList(NULL, NIL, tlist, false);
+		sortList = generate_setop_sortlist(tlist);
 		if (sortList)
 		{
 			plan = (Plan *) make_sort_from_sortclauses(root, sortList, plan);
@ -384,7 +385,7 @@ generate_nonunion_plan(SetOperationStmt *op, PlannerInfo *root,
 	 * Sort the child results, then add a SetOp plan node to generate the
 	 * correct output.
 	 */
-	sortList = addAllTargetsToSortList(NULL, NIL, tlist, false);
+	sortList = generate_setop_sortlist(tlist);

 	if (sortList == NIL)		/* nothing to sort on? */
 	{
@ -675,6 +676,31 @@ generate_append_tlist(List *colTypes, bool flag,
 	return tlist;
 }

+/*
+ * generate_setop_sortlist
+ *		Build a SortClause list enumerating all the non-resjunk tlist entries,
+ *		using default ordering properties.
+ */
+static List *
+generate_setop_sortlist(List *targetlist)
+{
+	List	   *sortlist = NIL;
+	ListCell   *l;
+
+	foreach(l, targetlist)
+	{
+		TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+		if (!tle->resjunk)
+			sortlist = addTargetToSortList(NULL, tle,
+										   sortlist, targetlist,
+										   SORTBY_DEFAULT,
+										   SORTBY_NULLS_DEFAULT,
+										   NIL, false);
+	}
+	return sortlist;
+}
+

 /*
 * find_all_inheritors -
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@ -17,7 +17,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- *	$PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.373 2008/07/18 20:26:06 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/parser/analyze.c,v 1.374 2008/07/31 22:47:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -711,6 +711,8 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	/*
 	 * Transform sorting/grouping stuff.  Do ORDER BY first because both
 	 * transformGroupClause and transformDistinctClause need the results.
+	 * Note that these functions can also change the targetList, so it's
+	 * passed to them by reference.
 	 */
 	qry->sortClause = transformSortClause(pstate,
 										  stmt->sortClause,
@ -725,8 +727,9 @@ transformSelectStmt(ParseState *pstate, SelectStmt *stmt)
 	qry->distinctClause = transformDistinctClause(pstate,
 												  stmt->distinctClause,
 												  &qry->targetList,
-												  &qry->sortClause);
+												  qry->sortClause);

+	/* transform LIMIT */
 	qry->limitOffset = transformLimitClause(pstate, stmt->limitOffset,
 											"OFFSET");
 	qry->limitCount = transformLimitClause(pstate, stmt->limitCount,
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.170 2008/06/19 00:46:05 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/parser/parse_clause.c,v 1.171 2008/07/31 22:47:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -1454,16 +1454,23 @@ transformSortClause(ParseState *pstate,
 * transformDistinctClause -
 *	  transform a DISTINCT or DISTINCT ON clause
 *
- * Since we may need to add items to the query's sortClause list, that list
- * is passed by reference.	Likewise for the targetlist.
+ * Since we may need to add items to the query's targetlist, that list
+ * is passed by reference.
+ *
+ * As with GROUP BY, we absorb the sorting semantics of ORDER BY as much as
+ * possible into the distinctClause.  This avoids a possible need to re-sort,
+ * and allows the user to determine the equality semantics used by DISTINCT,
+ * should she be working with a datatype that has more than one btree equality
+ * operator.
 */
 List *
 transformDistinctClause(ParseState *pstate, List *distinctlist,
-						List **targetlist, List **sortClause)
+						List **targetlist, List *sortClause)
 {
 	List	   *result = NIL;
 	ListCell   *slitem;
 	ListCell   *dlitem;
+	ListCell   *tlitem;

 	/* No work if there was no DISTINCT clause */
 	if (distinctlist == NIL)
@ -1474,25 +1481,21 @@ transformDistinctClause(ParseState *pstate, List *distinctlist,
 		/* We had SELECT DISTINCT */

 		/*
-		 * All non-resjunk elements from target list that are not already in
-		 * the sort list should be added to it.  (We don't really care what
-		 * order the DISTINCT fields are checked in, so we can leave the
-		 * user's ORDER BY spec alone, and just add additional sort keys to it
-		 * to ensure that all targetlist items get sorted.)
+		 * The distinctClause should consist of all ORDER BY items followed
+		 * by all other non-resjunk targetlist items.  There must not be any
+		 * resjunk ORDER BY items --- that would imply that we are sorting
+		 * by a value that isn't necessarily unique within a DISTINCT group,
+		 * so the results wouldn't be well-defined.  This construction
+		 * ensures we follow the rule that sortClause and distinctClause match;
+		 * in fact the sortClause will always be a prefix of distinctClause.
+		 *
+		 * Note a corner case: the same TLE could be in the ORDER BY list
+		 * multiple times with different sortops.  We have to include it in
+		 * the distinctClause the same way to preserve the prefix property.
+		 * The net effect will be that the TLE value will be made unique
+		 * according to both sortops.
 		 */
-		*sortClause = addAllTargetsToSortList(pstate,
-											  *sortClause,
-											  *targetlist,
-											  true);
-
-		/*
-		 * Now, DISTINCT list consists of all non-resjunk sortlist items.
-		 * Actually, all the sortlist items had better be non-resjunk!
-		 * Otherwise, user wrote SELECT DISTINCT with an ORDER BY item that
-		 * does not appear anywhere in the SELECT targetlist, and we can't
-		 * implement that with only one sorting pass...
-		 */
-		foreach(slitem, *sortClause)
+		foreach(slitem, sortClause)
 		{
 			SortClause *scl = (SortClause *) lfirst(slitem);
 			TargetEntry *tle = get_sortgroupclause_tle(scl, *targetlist);
@ -1504,110 +1507,112 @@ transformDistinctClause(ParseState *pstate, List *distinctlist,
 			else
 				result = lappend(result, copyObject(scl));
 		}
+
+		/*
+		 * Now add any remaining non-resjunk tlist items, using default
+		 * sorting semantics for their data types.
+		 */
+		foreach(tlitem, *targetlist)
+		{
+			TargetEntry *tle = (TargetEntry *) lfirst(tlitem);
+
+			if (tle->resjunk)
+				continue;		/* ignore junk */
+			if (targetIsInSortList(tle, InvalidOid, result))
+				continue;		/* already in list (with some semantics) */
+			result = addTargetToSortList(pstate, tle,
+										 result, *targetlist,
+										 SORTBY_DEFAULT,
+										 SORTBY_NULLS_DEFAULT,
+										 NIL, true);
+		}
 	}
 	else
 	{
 		/* We had SELECT DISTINCT ON (expr, ...) */
+		Bitmapset  *refnos = NULL;
+		int			sortgroupref;
+		bool		skipped_sortitem;

 		/*
-		 * If the user writes both DISTINCT ON and ORDER BY, then the two
-		 * expression lists must match (until one or the other runs out).
-		 * Otherwise the ORDER BY requires a different sort order than the
-		 * DISTINCT does, and we can't implement that with only one sort pass
-		 * (and if we do two passes, the results will be rather
-		 * unpredictable). However, it's OK to have more DISTINCT ON
-		 * expressions than ORDER BY expressions; we can just add the extra
-		 * DISTINCT values to the sort list, much as we did above for ordinary
-		 * DISTINCT fields.
-		 *
-		 * Actually, it'd be OK for the common prefixes of the two lists to
-		 * match in any order, but implementing that check seems like more
-		 * trouble than it's worth.
+		 * Add all the DISTINCT ON expressions to the tlist (if not already
+		 * present, they are added as resjunk items).  Assign sortgroupref
+		 * numbers to them, and form a bitmapset of these numbers.  (A
+		 * bitmapset is convenient here because we don't care about order
+		 * and we can discard duplicates.)
 		 */
-		ListCell   *nextsortlist = list_head(*sortClause);
-
 		foreach(dlitem, distinctlist)
 		{
+			Node   *dexpr = (Node *) lfirst(dlitem);
 			TargetEntry *tle;

-			tle = findTargetlistEntry(pstate, lfirst(dlitem),
+			tle = findTargetlistEntry(pstate, dexpr,
 									  targetlist, DISTINCT_ON_CLAUSE);
+			sortgroupref = assignSortGroupRef(tle, *targetlist);
+			refnos = bms_add_member(refnos, sortgroupref);
+		}

-			if (nextsortlist != NULL)
+		/*
+		 * If the user writes both DISTINCT ON and ORDER BY, adopt the
+		 * sorting semantics from ORDER BY items that match DISTINCT ON
+		 * items, and also adopt their column sort order.  We insist that
+		 * the distinctClause and sortClause match, so throw error if we
+		 * find the need to add any more distinctClause items after we've
+		 * skipped an ORDER BY item that wasn't in DISTINCT ON.
+		 */
+		skipped_sortitem = false;
+		foreach(slitem, sortClause)
+		{
+			SortClause *scl = (SortClause *) lfirst(slitem);
+
+			if (bms_is_member(scl->tleSortGroupRef, refnos))
 			{
-				SortClause *scl = (SortClause *) lfirst(nextsortlist);
-
-				if (tle->ressortgroupref != scl->tleSortGroupRef)
+				if (skipped_sortitem)
 					ereport(ERROR,
 							(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
 							 errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions")));
-				result = lappend(result, copyObject(scl));
-				nextsortlist = lnext(nextsortlist);
+				else
+					result = lappend(result, copyObject(scl));
 			}
 			else
 			{
-				*sortClause = addTargetToSortList(pstate, tle,
-												  *sortClause, *targetlist,
-												  SORTBY_DEFAULT,
-												  SORTBY_NULLS_DEFAULT,
-												  NIL, true);
-
-				/*
-				 * Probably, the tle should always have been added at the end
-				 * of the sort list ... but search to be safe.
-				 */
-				foreach(slitem, *sortClause)
-				{
-					SortClause *scl = (SortClause *) lfirst(slitem);
-
-					if (tle->ressortgroupref == scl->tleSortGroupRef)
-					{
-						result = lappend(result, copyObject(scl));
-						break;
-					}
-				}
-				if (slitem == NULL)		/* should not happen */
-					elog(ERROR, "failed to add DISTINCT ON clause to target list");
+				skipped_sortitem = true;
 			}
 		}
+
+		/*
+		 * Now add any remaining DISTINCT ON items, using default sorting
+		 * semantics for their data types.  (Note: this is pretty
+		 * questionable; if the ORDER BY list doesn't include all the DISTINCT
+		 * ON items and more besides, you certainly aren't using DISTINCT ON
+		 * in the intended way, and you probably aren't going to get
+		 * consistent results.  It might be better to throw an error or warning
+		 * here.  But historically we've allowed it, so keep doing so.)
+		 */
+		while ((sortgroupref = bms_first_member(refnos)) >= 0)
+		{
+			TargetEntry *tle = get_sortgroupref_tle(sortgroupref, *targetlist);
+
+			if (targetIsInSortList(tle, InvalidOid, result))
+				continue;		/* already in list (with some semantics) */
+			if (skipped_sortitem)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+						 errmsg("SELECT DISTINCT ON expressions must match initial ORDER BY expressions")));
+			result = addTargetToSortList(pstate, tle,
+										 result, *targetlist,
+										 SORTBY_DEFAULT,
+										 SORTBY_NULLS_DEFAULT,
+										 NIL, true);
+		}
 	}

 	return result;
 }

-/*
- * addAllTargetsToSortList
- *		Make sure all non-resjunk targets in the targetlist are in the
- *		ORDER BY list, adding the not-yet-sorted ones to the end of the list.
- *		This is typically used to help implement SELECT DISTINCT.
- *
- * See addTargetToSortList for info about pstate and resolveUnknown inputs.
- *
- * Returns the updated ORDER BY list.
- */
-List *
-addAllTargetsToSortList(ParseState *pstate, List *sortlist,
-						List *targetlist, bool resolveUnknown)
-{
-	ListCell   *l;
-
-	foreach(l, targetlist)
-	{
-		TargetEntry *tle = (TargetEntry *) lfirst(l);
-
-		if (!tle->resjunk)
-			sortlist = addTargetToSortList(pstate, tle,
-										   sortlist, targetlist,
-										   SORTBY_DEFAULT,
-										   SORTBY_NULLS_DEFAULT,
-										   NIL, resolveUnknown);
-	}
-	return sortlist;
-}
-
 /*
 * addTargetToSortList
- *		If the given targetlist entry isn't already in the ORDER BY list,
+ *		If the given targetlist entry isn't already in the SortClause list,
 *		add it to the end of the list, using the given sort ordering info.
 *
 * If resolveUnknown is TRUE, convert TLEs of type UNKNOWN to TEXT.  If not,
@ -1615,7 +1620,7 @@ addAllTargetsToSortList(ParseState *pstate, List *sortlist,
 * pstate should be provided if resolveUnknown is TRUE, but can be NULL
 * otherwise.
 *
- * Returns the updated ORDER BY list.
+ * Returns the updated SortClause list.
 */
 List *
 addTargetToSortList(ParseState *pstate, TargetEntry *tle,
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.368 2008/07/18 03:32:53 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/parsenodes.h,v 1.369 2008/07/31 22:47:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -613,14 +613,19 @@ typedef struct RangeTblEntry
 * tleSortGroupRef must match ressortgroupref of exactly one entry of the
 * associated targetlist; that is the expression to be sorted (or grouped) by.
 * sortop is the OID of the ordering operator (a "<" or ">" operator).
- * nulls_first does about what you'd expect.
+ * nulls_first means about what you'd expect.
 *
 * SortClauses are also used to identify targets that we will do a "Unique"
 * filter step on (for SELECT DISTINCT and SELECT DISTINCT ON).  The
- * distinctClause list is simply a copy of the relevant members of the
- * sortClause list.  Note that distinctClause can be a subset of sortClause,
- * but cannot have members not present in sortClause; and the members that
- * do appear must be in the same order as in sortClause.
+ * distinctClause list is a list of SortClauses for the expressions to be
+ * unique-ified.  (As per comment for GroupClause, this overspecifies the
+ * semantics.)  In SELECT DISTINCT, the distinctClause list is typically
+ * longer than the ORDER BY list, while in SELECT DISTINCT ON it's typically
+ * shorter.  The two lists must match up to the end of the shorter one ---
+ * the parser rearranges the distinctClause if necessary to make this true.
+ * (This restriction ensures that only one sort step is needed to both
+ * satisfy the ORDER BY and set up for the Unique step.  This is semantically
+ * necessary for DISTINCT ON, and offers no real drawback for DISTINCT.)
 */
 typedef struct SortClause
 {
--- a/src/include/parser/parse_clause.h
+++ b/src/include/parser/parse_clause.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/parser/parse_clause.h,v 1.49 2008/01/01 19:45:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/parser/parse_clause.h,v 1.50 2008/07/31 22:47:56 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -31,11 +31,8 @@ extern List *transformGroupClause(ParseState *pstate, List *grouplist,
 extern List *transformSortClause(ParseState *pstate, List *orderlist,
 					List **targetlist, bool resolveUnknown);
 extern List *transformDistinctClause(ParseState *pstate, List *distinctlist,
-						List **targetlist, List **sortClause);
+						List **targetlist, List *sortClause);

-extern List *addAllTargetsToSortList(ParseState *pstate,
-						List *sortlist, List *targetlist,
-						bool resolveUnknown);
 extern List *addTargetToSortList(ParseState *pstate, TargetEntry *tle,
 					List *sortlist, List *targetlist,
 					SortByDir sortby_dir, SortByNulls sortby_nulls,