Improve bit perturbation in TupleHashTableHash.

The changes in b81b5a96 did not fully address the issue, because the bit-mixing of the IV into the final hash-key didn't prevent clustering in the input-data survive in the output data. This didn't cause a lot of problems because of the additional growth conditions added d4c62a6b. But as we want to rein those in due to explosive growth in some edges, this needs to be fixed. Author: Andres Freund Discussion: https://postgr.es/m/20171127185700.1470.20362@wrigleys.postgresql.org Backpatch: 10, where simplehash was introduced

Improve bit perturbation in TupleHashTableHash.
The changes in b81b5a96 did not fully address the issue, because the bit-mixing of the IV into the final hash-key didn't prevent clustering in the input-data survive in the output data. This didn't cause a lot of problems because of the additional growth conditions added d4c62a6b. But as we want to rein those in due to explosive growth in some edges, this needs to be fixed. Author: Andres Freund Discussion: https://postgr.es/m/20171127185700.1470.20362@wrigleys.postgresql.org Backpatch: 10, where simplehash was introduced
c068f877 · Andres Freund · 15be2746 · c068f877 · c068f877 · c068f877
Commit c068f877 authored Jan 29, 2018 by Andres Freund
3 changed files
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -23,6 +23,7 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/lsyscache.h"
+#include "utils/hashutils.h"
 #include "utils/memutils.h"
 static uint32 TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple);
@@ -326,7 +327,7 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 	 * underestimated.
 	 */
 	if (use_variable_hash_iv)
-		hashtable->hash_iv = hash_uint32(ParallelWorkerNumber);
+		hashtable->hash_iv = murmurhash32(ParallelWorkerNumber);
 	else
 		hashtable->hash_iv = 0;
@@ -510,7 +511,13 @@ TupleHashTableHash(struct tuplehash_hash *tb, const MinimalTuple tuple)
 		}
 	}
-	return hashkey;
+	/*
+	 * The way hashes are combined above, among each other and with the IV,
+	 * doesn't lead to good bit perturbation. As the IV's goal is to lead to
+	 * achieve that, perform a round of hashing of the combined hash -
+	 * resulting in near perfect perturbation.
+	 */
+	return murmurhash32(hashkey);
 }
 /*

--- a/src/test/regress/expected/groupingsets.out
+++ b/src/test/regress/expected/groupingsets.out
@@ -1183,29 +1183,33 @@ explain (costs off)
 -- simple rescan tests
 select a, b, sum(v.x)
  from (values (1),(2)) v(x), gstest_data(v.x)
- group by grouping sets (a,b);
+ group by grouping sets (a,b)
+ order by 1, 2, 3;
 a | b | sum 
 ---+---+-----
- 2 |   |   6
 1 |   |   3
+ 2 |   |   6
+   | 1 |   3
   | 2 |   3
   | 3 |   3
-   | 1 |   3
 (5 rows)
 explain (costs off)
  select a, b, sum(v.x)
    from (values (1),(2)) v(x), gstest_data(v.x)
-   group by grouping sets (a,b);
+   group by grouping sets (a,b)
-                QUERY PLAN                
+   order by 3, 1, 2;
------------------------------------------
+                             QUERY PLAN                              
- HashAggregate
+---------------------------------------------------------------------
-   Hash Key: gstest_data.a
+ Sort
-   Hash Key: gstest_data.b
+   Sort Key: (sum("*VALUES*".column1)), gstest_data.a, gstest_data.b
-   ->  Nested Loop
+   ->  HashAggregate
-         ->  Values Scan on "*VALUES*"
+         Hash Key: gstest_data.a
-         ->  Function Scan on gstest_data
+         Hash Key: gstest_data.b
-(6 rows)
+         ->  Nested Loop
+               ->  Values Scan on "*VALUES*"
+               ->  Function Scan on gstest_data
+(8 rows)
 select *
  from (values (1),(2)) v(x),

--- a/src/test/regress/sql/groupingsets.sql
+++ b/src/test/regress/sql/groupingsets.sql
@@ -342,12 +342,13 @@ explain (costs off)
 select a, b, sum(v.x)
  from (values (1),(2)) v(x), gstest_data(v.x)
- group by grouping sets (a,b);
+ group by grouping sets (a,b)
+ order by 1, 2, 3;
 explain (costs off)
  select a, b, sum(v.x)
    from (values (1),(2)) v(x), gstest_data(v.x)
-   group by grouping sets (a,b);
+   group by grouping sets (a,b)
+   order by 3, 1, 2;
 select *
  from (values (1),(2)) v(x),
       lateral (select a, b, sum(v.x) from gstest_data(v.x) group by grouping sets (a,b)) s;