Commit bed9ef5a authored by Alvaro Herrera's avatar Alvaro Herrera

Rework the stats_ext test

As suggested by Tom Lane, avoid printing specific estimated cost values,
because they vary across architectures; instead, verify plan shapes (in
this case, HashAggregate vs. GroupAggregate), as we do in other planner
tests.

We can now remove expected/stats_ext_1.out.

Author: Tomas Vondra
parent 70ec3f1f
-- Generic extended statistics support
-- We will be checking execution plans without/with statistics, so
-- let's make sure we get simple non-parallel plans. Also set the
-- work_mem low so that we can use small amounts of data.
SET max_parallel_workers = 0;
SET max_parallel_workers_per_gather = 0;
SET work_mem = '128kB';
-- Ensure stats are dropped sanely
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
......@@ -49,6 +55,67 @@ CREATE TABLE ndistinct (
c INT,
d INT
);
-- over-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1)
SELECT i/100, i/100, i/100, cash_words((i/100)::money)
FROM generate_series(1,30000) s(i);
ANALYZE ndistinct;
-- Group Aggregate, due to over-estimate of the number of groups
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: a, b
-> Sort
Sort Key: a, b
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: b, c
-> Sort
Sort Key: b, c
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: a, b, c
-> Sort
Sort Key: a, b, c
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: a, b, c, d
-> Sort
Sort Key: a, b, c, d
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: b, c, d
-> Sort
Sort Key: b, c, d
-> Seq Scan on ndistinct
(5 rows)
-- unknown column
CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
ERROR: column "unknown_column" referenced in statistics does not exist
......@@ -63,18 +130,15 @@ CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
ERROR: duplicate column name in statistics definition
-- correct command
CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
-- perfectly correlated groups
INSERT INTO ndistinct (a, b, c, filler1)
SELECT i/100, i/100, i/100, cash_words(i::money)
FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
staenabled | standistinct
------------+------------------------------------------------------------------------------------------------
{d} | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}]
{d} | [{(b 3 4), 301.000000}, {(b 3 6), 301.000000}, {(b 4 6), 301.000000}, {(b 3 4 6), 301.000000}]
(1 row)
-- Hash Aggregate, thanks to estimates improved by the statistic
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
QUERY PLAN
......@@ -85,78 +149,165 @@ EXPLAIN (COSTS off)
(3 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b, c
Group Key: b, c
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b, c, d
Group Key: a, b, c
-> Seq Scan on ndistinct
(3 rows)
-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: a, b, c, d
-> Sort
Sort Key: a, b, c, d
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: b, c, d
-> Sort
Sort Key: b, c, d
-> Seq Scan on ndistinct
(5 rows)
TRUNCATE TABLE ndistinct;
-- partially correlated groups
INSERT INTO ndistinct (a, b, c)
SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
-- under-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1)
SELECT mod(i,50), mod(i,51), mod(i,32),
cash_words(mod(i,33)::int::money)
FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
staenabled | standistinct
------------+------------------------------------------------------------------------------------------------
{d} | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}]
staenabled | standistinct
------------+----------------------------------------------------------------------------------------------------
{d} | [{(b 3 4), 2550.000000}, {(b 3 6), 800.000000}, {(b 4 6), 1632.000000}, {(b 3 4 6), 10000.000000}]
(1 row)
EXPLAIN
-- plans using Group Aggregate, thanks to using correct esimates
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
QUERY PLAN
---------------------------------------------------------------------
HashAggregate (cost=230.00..232.01 rows=201 width=16)
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: a, b
-> Sort
Sort Key: a, b
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: a, b, c
-> Sort
Sort Key: a, b, c
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
QUERY PLAN
-----------------------------------
GroupAggregate
Group Key: a, b, c, d
-> Sort
Sort Key: a, b, c, d
-> Seq Scan on ndistinct
(5 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: b, c, d
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, d
-> Seq Scan on ndistinct
(3 rows)
DROP STATISTICS s10;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
staenabled | standistinct
------------+--------------
(0 rows)
-- dropping the statistics switches the plans to Hash Aggregate,
-- due to under-estimates
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b
-> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8)
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
QUERY PLAN
----------------------------------------------------------------------
HashAggregate (cost=255.00..257.01 rows=201 width=20)
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b, c
-> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12)
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
QUERY PLAN
----------------------------------------------------------------------
HashAggregate (cost=280.00..290.00 rows=1000 width=24)
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b, c, d
-> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=16)
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
QUERY PLAN
----------------------------------------------------------------------
HashAggregate (cost=255.00..265.00 rows=1000 width=20)
QUERY PLAN
-----------------------------
HashAggregate
Group Key: b, c, d
-> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=12)
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
QUERY PLAN
---------------------------------------------------------------------
HashAggregate (cost=230.00..240.00 rows=1000 width=16)
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, d
-> Seq Scan on ndistinct (cost=0.00..155.00 rows=10000 width=8)
-> Seq Scan on ndistinct
(3 rows)
DROP TABLE ndistinct;
-- Generic extended statistics support
-- Ensure stats are dropped sanely
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
DROP STATISTICS ab1_a_b_stats;
CREATE SCHEMA regress_schema_2;
CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1;
DROP STATISTICS regress_schema_2.ab1_a_b_stats;
-- Ensure statistics are dropped when columns are
CREATE STATISTICS ab1_b_c_stats ON (b, c) FROM ab1;
CREATE STATISTICS ab1_a_b_c_stats ON (a, b, c) FROM ab1;
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
ALTER TABLE ab1 DROP COLUMN a;
\d ab1
Table "public.ab1"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
b | integer | | |
c | integer | | |
Statistics:
"public.ab1_b_c_stats" WITH (ndistinct) ON (b, c)
DROP TABLE ab1;
-- Ensure things work sanely with SET STATISTICS 0
CREATE TABLE ab1 (a INTEGER, b INTEGER);
ALTER TABLE ab1 ALTER a SET STATISTICS 0;
INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a;
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
ANALYZE ab1;
ERROR: extended statistics could not be collected for column "a" of relation public.ab1
HINT: Consider ALTER TABLE "public"."ab1" ALTER "a" SET STATISTICS -1
ALTER TABLE ab1 ALTER a SET STATISTICS -1;
ANALYZE ab1;
DROP TABLE ab1;
-- n-distinct tests
CREATE TABLE ndistinct (
filler1 TEXT,
filler2 NUMERIC,
a INT,
b INT,
filler3 DATE,
c INT,
d INT
);
-- unknown column
CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
ERROR: column "unknown_column" referenced in statistics does not exist
-- single column
CREATE STATISTICS s10 ON (a) FROM ndistinct;
ERROR: statistics require at least 2 columns
-- single column, duplicated
CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
ERROR: duplicate column name in statistics definition
-- two columns, one duplicated
CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
ERROR: duplicate column name in statistics definition
-- correct command
CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
-- perfectly correlated groups
INSERT INTO ndistinct (a, b, c, filler1)
SELECT i/100, i/100, i/100, cash_words(i::money)
FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
staenabled | standistinct
------------+------------------------------------------------------------------------------------------------
{d} | [{(b 3 4), 101.000000}, {(b 3 6), 101.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 101.000000}]
(1 row)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b, c
-> Seq Scan on ndistinct
(3 rows)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
QUERY PLAN
-----------------------------
HashAggregate
Group Key: a, b, c, d
-> Seq Scan on ndistinct
(3 rows)
TRUNCATE TABLE ndistinct;
-- partially correlated groups
INSERT INTO ndistinct (a, b, c)
SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
staenabled | standistinct
------------+------------------------------------------------------------------------------------------------
{d} | [{(b 3 4), 201.000000}, {(b 3 6), 201.000000}, {(b 4 6), 101.000000}, {(b 3 4 6), 201.000000}]
(1 row)
EXPLAIN
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
QUERY PLAN
---------------------------------------------------------------------
HashAggregate (cost=225.00..227.01 rows=201 width=16)
Group Key: a, b
-> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=8)
(3 rows)
EXPLAIN
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
QUERY PLAN
----------------------------------------------------------------------
HashAggregate (cost=250.00..252.01 rows=201 width=20)
Group Key: a, b, c
-> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=12)
(3 rows)
EXPLAIN
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
QUERY PLAN
----------------------------------------------------------------------
HashAggregate (cost=275.00..285.00 rows=1000 width=24)
Group Key: a, b, c, d
-> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=16)
(3 rows)
EXPLAIN
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
QUERY PLAN
----------------------------------------------------------------------
HashAggregate (cost=250.00..260.00 rows=1000 width=20)
Group Key: b, c, d
-> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=12)
(3 rows)
EXPLAIN
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
QUERY PLAN
---------------------------------------------------------------------
HashAggregate (cost=225.00..235.00 rows=1000 width=16)
Group Key: a, d
-> Seq Scan on ndistinct (cost=0.00..150.00 rows=10000 width=8)
(3 rows)
DROP TABLE ndistinct;
-- Generic extended statistics support
-- We will be checking execution plans without/with statistics, so
-- let's make sure we get simple non-parallel plans. Also set the
-- work_mem low so that we can use small amounts of data.
SET max_parallel_workers = 0;
SET max_parallel_workers_per_gather = 0;
SET work_mem = '128kB';
-- Ensure stats are dropped sanely
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
......@@ -43,6 +50,29 @@ CREATE TABLE ndistinct (
d INT
);
-- over-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1)
SELECT i/100, i/100, i/100, cash_words((i/100)::money)
FROM generate_series(1,30000) s(i);
ANALYZE ndistinct;
-- Group Aggregate, due to over-estimate of the number of groups
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
-- unknown column
CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
......@@ -58,9 +88,35 @@ CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
-- correct command
CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
-- perfectly correlated groups
ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
-- Hash Aggregate, thanks to estimates improved by the statistic
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
TRUNCATE TABLE ndistinct;
-- under-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1)
SELECT i/100, i/100, i/100, cash_words(i::money)
SELECT mod(i,50), mod(i,51), mod(i,32),
cash_words(mod(i,33)::int::money)
FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
......@@ -68,6 +124,7 @@ ANALYZE ndistinct;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
-- plans using Group Aggregate, thanks to using correct esimates
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
......@@ -77,30 +134,32 @@ EXPLAIN (COSTS off)
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
TRUNCATE TABLE ndistinct;
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
-- partially correlated groups
INSERT INTO ndistinct (a, b, c)
SELECT i/50, i/100, i/200 FROM generate_series(1,10000) s(i);
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
ANALYZE ndistinct;
DROP STATISTICS s10;
SELECT staenabled, standistinct
FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
EXPLAIN
-- dropping the statistics switches the plans to Hash Aggregate,
-- due to under-estimates
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
EXPLAIN
EXPLAIN (COSTS off)
SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
DROP TABLE ndistinct;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment