-- Generic extended statistics support -- We will be checking execution plans without/with statistics, so -- let's make sure we get simple non-parallel plans. Also set the -- work_mem low so that we can use small amounts of data. SET max_parallel_workers = 0; SET max_parallel_workers_per_gather = 0; SET work_mem = '128kB'; -- Ensure stats are dropped sanely CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER); CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; DROP STATISTICS ab1_a_b_stats; CREATE SCHEMA regress_schema_2; CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1; -- Let's also verify the pg_get_statisticsextdef output looks sane. SELECT pg_get_statisticsextdef(oid) FROM pg_statistic_ext WHERE staname = 'ab1_a_b_stats'; pg_get_statisticsextdef --------------------------------------------------------------------- CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1 (1 row) DROP STATISTICS regress_schema_2.ab1_a_b_stats; -- Ensure statistics are dropped when columns are CREATE STATISTICS ab1_b_c_stats ON (b, c) FROM ab1; CREATE STATISTICS ab1_a_b_c_stats ON (a, b, c) FROM ab1; CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; ALTER TABLE ab1 DROP COLUMN a; \d ab1 Table "public.ab1" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- b | integer | | | c | integer | | | Statistics: "public.ab1_b_c_stats" WITH (ndistinct) ON (b, c) DROP TABLE ab1; -- Ensure things work sanely with SET STATISTICS 0 CREATE TABLE ab1 (a INTEGER, b INTEGER); ALTER TABLE ab1 ALTER a SET STATISTICS 0; INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a; CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1; ANALYZE ab1; ERROR: extended statistics could not be collected for column "a" of relation public.ab1 HINT: Consider ALTER TABLE "public"."ab1" ALTER "a" SET STATISTICS -1 ALTER TABLE ab1 ALTER a SET STATISTICS -1; ANALYZE ab1; DROP TABLE ab1; -- n-distinct tests CREATE TABLE ndistinct ( filler1 TEXT, filler2 NUMERIC, a INT, b INT, filler3 DATE, c INT, d INT ); -- over-estimates when using only per-column statistics INSERT INTO ndistinct (a, b, c, filler1) SELECT i/100, i/100, i/100, cash_words((i/100)::money) FROM generate_series(1,30000) s(i); ANALYZE ndistinct; -- Group Aggregate, due to over-estimate of the number of groups EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; QUERY PLAN ----------------------------------- GroupAggregate Group Key: a, b -> Sort Sort Key: a, b -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c; QUERY PLAN ----------------------------------- GroupAggregate Group Key: b, c -> Sort Sort Key: b, c -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; QUERY PLAN ----------------------------------- GroupAggregate Group Key: a, b, c -> Sort Sort Key: a, b, c -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; QUERY PLAN ----------------------------------- GroupAggregate Group Key: a, b, c, d -> Sort Sort Key: a, b, c, d -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; QUERY PLAN ----------------------------------- GroupAggregate Group Key: b, c, d -> Sort Sort Key: b, c, d -> Seq Scan on ndistinct (5 rows) -- unknown column CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct; ERROR: column "unknown_column" referenced in statistics does not exist -- single column CREATE STATISTICS s10 ON (a) FROM ndistinct; ERROR: statistics require at least 2 columns -- single column, duplicated CREATE STATISTICS s10 ON (a,a) FROM ndistinct; ERROR: duplicate column name in statistics definition -- two columns, one duplicated CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct; ERROR: duplicate column name in statistics definition -- correct command CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct; ANALYZE ndistinct; SELECT staenabled, standistinct FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; staenabled | standistinct ------------+------------------------------------------------------------------------------------------------ {d} | [{(b 3 4), 301.000000}, {(b 3 6), 301.000000}, {(b 4 6), 301.000000}, {(b 3 4 6), 301.000000}] (1 row) -- Hash Aggregate, thanks to estimates improved by the statistic EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; QUERY PLAN ----------------------------- HashAggregate Group Key: a, b -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c; QUERY PLAN ----------------------------- HashAggregate Group Key: b, c -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; QUERY PLAN ----------------------------- HashAggregate Group Key: a, b, c -> Seq Scan on ndistinct (3 rows) -- last two plans keep using Group Aggregate, because 'd' is not covered -- by the statistic and while it's NULL-only we assume 200 values for it EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; QUERY PLAN ----------------------------------- GroupAggregate Group Key: a, b, c, d -> Sort Sort Key: a, b, c, d -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; QUERY PLAN ----------------------------------- GroupAggregate Group Key: b, c, d -> Sort Sort Key: b, c, d -> Seq Scan on ndistinct (5 rows) TRUNCATE TABLE ndistinct; -- under-estimates when using only per-column statistics INSERT INTO ndistinct (a, b, c, filler1) SELECT mod(i,50), mod(i,51), mod(i,32), cash_words(mod(i,33)::int::money) FROM generate_series(1,10000) s(i); ANALYZE ndistinct; SELECT staenabled, standistinct FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; staenabled | standistinct ------------+---------------------------------------------------------------------------------------------------- {d} | [{(b 3 4), 2550.000000}, {(b 3 6), 800.000000}, {(b 4 6), 1632.000000}, {(b 3 4 6), 10000.000000}] (1 row) -- plans using Group Aggregate, thanks to using correct esimates EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; QUERY PLAN ----------------------------------- GroupAggregate Group Key: a, b -> Sort Sort Key: a, b -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; QUERY PLAN ----------------------------------- GroupAggregate Group Key: a, b, c -> Sort Sort Key: a, b, c -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; QUERY PLAN ----------------------------------- GroupAggregate Group Key: a, b, c, d -> Sort Sort Key: a, b, c, d -> Seq Scan on ndistinct (5 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; QUERY PLAN ----------------------------- HashAggregate Group Key: b, c, d -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, d; QUERY PLAN ----------------------------- HashAggregate Group Key: a, d -> Seq Scan on ndistinct (3 rows) DROP STATISTICS s10; SELECT staenabled, standistinct FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass; staenabled | standistinct ------------+-------------- (0 rows) -- dropping the statistics switches the plans to Hash Aggregate, -- due to under-estimates EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b; QUERY PLAN ----------------------------- HashAggregate Group Key: a, b -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; QUERY PLAN ----------------------------- HashAggregate Group Key: a, b, c -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; QUERY PLAN ----------------------------- HashAggregate Group Key: a, b, c, d -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; QUERY PLAN ----------------------------- HashAggregate Group Key: b, c, d -> Seq Scan on ndistinct (3 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, d; QUERY PLAN ----------------------------- HashAggregate Group Key: a, d -> Seq Scan on ndistinct (3 rows) DROP TABLE ndistinct;