-- Generic extended statistics support
-- We will be checking execution plans without/with statistics, so
-- let's make sure we get simple non-parallel plans. Also set the
-- work_mem low so that we can use small amounts of data.
SET max_parallel_workers = 0;
SET max_parallel_workers_per_gather = 0;
SET work_mem = '128kB';
-- Ensure stats are dropped sanely
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
DROP STATISTICS ab1_a_b_stats;
CREATE SCHEMA regress_schema_2;
CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1;
-- Let's also verify the pg_get_statisticsextdef output looks sane.
SELECT pg_get_statisticsextdef(oid) FROM pg_statistic_ext WHERE staname = 'ab1_a_b_stats';
                       pg_get_statisticsextdef                       
---------------------------------------------------------------------
 CREATE STATISTICS regress_schema_2.ab1_a_b_stats ON (a, b) FROM ab1
(1 row)

DROP STATISTICS regress_schema_2.ab1_a_b_stats;
-- Ensure statistics are dropped when columns are
CREATE STATISTICS ab1_b_c_stats ON (b, c) FROM ab1;
CREATE STATISTICS ab1_a_b_c_stats ON (a, b, c) FROM ab1;
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
ALTER TABLE ab1 DROP COLUMN a;
\d ab1
                Table "public.ab1"
 Column |  Type   | Collation | Nullable | Default 
--------+---------+-----------+----------+---------
 b      | integer |           |          | 
 c      | integer |           |          | 
Statistics:
    "public.ab1_b_c_stats" WITH (ndistinct) ON (b, c)

DROP TABLE ab1;
-- Ensure things work sanely with SET STATISTICS 0
CREATE TABLE ab1 (a INTEGER, b INTEGER);
ALTER TABLE ab1 ALTER a SET STATISTICS 0;
INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a;
CREATE STATISTICS ab1_a_b_stats ON (a, b) FROM ab1;
ANALYZE ab1;
ERROR:  extended statistics could not be collected for column "a" of relation public.ab1
HINT:  Consider ALTER TABLE "public"."ab1" ALTER "a" SET STATISTICS -1
ALTER TABLE ab1 ALTER a SET STATISTICS -1;
ANALYZE ab1;
DROP TABLE ab1;
-- n-distinct tests
CREATE TABLE ndistinct (
    filler1 TEXT,
    filler2 NUMERIC,
    a INT,
    b INT,
    filler3 DATE,
    c INT,
    d INT
);
-- over-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1)
     SELECT i/100, i/100, i/100, cash_words((i/100)::money)
       FROM generate_series(1,30000) s(i);
ANALYZE ndistinct;
-- Group Aggregate, due to over-estimate of the number of groups
EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: a, b
   ->  Sort
         Sort Key: a, b
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: b, c
   ->  Sort
         Sort Key: b, c
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: a, b, c
   ->  Sort
         Sort Key: a, b, c
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: a, b, c, d
   ->  Sort
         Sort Key: a, b, c, d
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: b, c, d
   ->  Sort
         Sort Key: b, c, d
         ->  Seq Scan on ndistinct
(5 rows)

-- unknown column
CREATE STATISTICS s10 ON (unknown_column) FROM ndistinct;
ERROR:  column "unknown_column" referenced in statistics does not exist
-- single column
CREATE STATISTICS s10 ON (a) FROM ndistinct;
ERROR:  statistics require at least 2 columns
-- single column, duplicated
CREATE STATISTICS s10 ON (a,a) FROM ndistinct;
ERROR:  duplicate column name in statistics definition
-- two columns, one duplicated
CREATE STATISTICS s10 ON (a, a, b) FROM ndistinct;
ERROR:  duplicate column name in statistics definition
-- correct command
CREATE STATISTICS s10 ON (a, b, c) FROM ndistinct;
ANALYZE ndistinct;
SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled |                                          standistinct                                          
------------+------------------------------------------------------------------------------------------------
 {d}        | [{(b 3 4), 301.000000}, {(b 3 6), 301.000000}, {(b 4 6), 301.000000}, {(b 3 4 6), 301.000000}]
(1 row)

-- Hash Aggregate, thanks to estimates improved by the statistic
EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: a, b
   ->  Seq Scan on ndistinct
(3 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: b, c
   ->  Seq Scan on ndistinct
(3 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: a, b, c
   ->  Seq Scan on ndistinct
(3 rows)

-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: a, b, c, d
   ->  Sort
         Sort Key: a, b, c, d
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: b, c, d
   ->  Sort
         Sort Key: b, c, d
         ->  Seq Scan on ndistinct
(5 rows)

TRUNCATE TABLE ndistinct;
-- under-estimates when using only per-column statistics
INSERT INTO ndistinct (a, b, c, filler1)
     SELECT mod(i,50), mod(i,51), mod(i,32),
            cash_words(mod(i,33)::int::money)
       FROM generate_series(1,10000) s(i);
ANALYZE ndistinct;
SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled |                                            standistinct                                            
------------+----------------------------------------------------------------------------------------------------
 {d}        | [{(b 3 4), 2550.000000}, {(b 3 6), 800.000000}, {(b 4 6), 1632.000000}, {(b 3 4 6), 10000.000000}]
(1 row)

-- plans using Group Aggregate, thanks to using correct esimates
EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: a, b
   ->  Sort
         Sort Key: a, b
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: a, b, c
   ->  Sort
         Sort Key: a, b, c
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
            QUERY PLAN             
-----------------------------------
 GroupAggregate
   Group Key: a, b, c, d
   ->  Sort
         Sort Key: a, b, c, d
         ->  Seq Scan on ndistinct
(5 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: b, c, d
   ->  Seq Scan on ndistinct
(3 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: a, d
   ->  Seq Scan on ndistinct
(3 rows)

DROP STATISTICS s10;
SELECT staenabled, standistinct
  FROM pg_statistic_ext WHERE starelid = 'ndistinct'::regclass;
 staenabled | standistinct 
------------+--------------
(0 rows)

-- dropping the statistics switches the plans to Hash Aggregate,
-- due to under-estimates
EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: a, b
   ->  Seq Scan on ndistinct
(3 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: a, b, c
   ->  Seq Scan on ndistinct
(3 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: a, b, c, d
   ->  Seq Scan on ndistinct
(3 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: b, c, d
   ->  Seq Scan on ndistinct
(3 rows)

EXPLAIN (COSTS off)
 SELECT COUNT(*) FROM ndistinct GROUP BY a, d;
         QUERY PLAN          
-----------------------------
 HashAggregate
   Group Key: a, d
   ->  Seq Scan on ndistinct
(3 rows)

DROP TABLE ndistinct;