Collect and use element-frequency statistics for arrays.

This patch improves selectivity estimation for the array <@, &&, and @> (containment and overlaps) operators. It enables collection of statistics about individual array element values by ANALYZE, and introduces operator-specific estimators that use these stats. In addition, ScalarArrayOpExpr constructs of the forms "const = ANY/ALL (array_column)" and "const <> ANY/ALL (array_column)" are estimated by treating them as variants of the containment operators. Since we still collect scalar-style stats about the array values as a whole, the pg_stats view is expanded to show both these stats and the array-style stats in separate columns. This creates an incompatible change in how stats for tsvector columns are displayed in pg_stats: the stats about lexemes are now displayed in the array-related columns instead of the original scalar-related columns. There are a few loose ends here, notably that it'd be nice to be able to suppress either the scalar-style stats or the array-element stats for columns for which they're not useful. But the patch is in good enough shape to commit for wider testing. Alexander Korotkov, reviewed by Noah Misch and Nathan Boley
author: Tom Lane <tgl@sss.pgh.pa.us> 2012-03-03 20:20:19 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2012-03-03 20:20:57 -0500
commit: 0e5e167aaea4ceb355a6e20eec96c4f7d05527ab (patch)
tree: 1b1b338461cba27a2d783db13b74d1b7b86b6681 /src/backend
parent: 34c978442c55dd13a3a8c6b90fd4380dad02f3da (diff)
download: postgresql-0e5e167aaea4ceb355a6e20eec96c4f7d05527ab.tar.gz
10 files changed, 2086 insertions, 34 deletions
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index aef410ae9b..a8653cd495 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1182,7 +1182,7 @@ heap_create_with_catalog(const char *relname,
 				   F_ARRAY_SEND,	/* array send (bin) proc */
 				   InvalidOid,	/* typmodin procedure - none */
 				   InvalidOid,	/* typmodout procedure - none */
-				   InvalidOid,	/* analyze procedure - default */
+				   F_ARRAY_TYPANALYZE,	/* array analyze procedure */
 				   new_type_oid,	/* array element type - the rowtype */
 				   true,		/* yes, this is an array type */
 				   InvalidOid,	/* this has no array type */
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 30b0bd06df..ab594eba9b 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -117,29 +117,54 @@ CREATE VIEW pg_stats AS
         stawidth AS avg_width,
         stadistinct AS n_distinct,
         CASE
-            WHEN stakind1 IN (1, 4) THEN stavalues1
-            WHEN stakind2 IN (1, 4) THEN stavalues2
-            WHEN stakind3 IN (1, 4) THEN stavalues3
-            WHEN stakind4 IN (1, 4) THEN stavalues4
+            WHEN stakind1 = 1 THEN stavalues1
+            WHEN stakind2 = 1 THEN stavalues2
+            WHEN stakind3 = 1 THEN stavalues3
+            WHEN stakind4 = 1 THEN stavalues4
+            WHEN stakind5 = 1 THEN stavalues5
         END AS most_common_vals,
         CASE
-            WHEN stakind1 IN (1, 4) THEN stanumbers1
-            WHEN stakind2 IN (1, 4) THEN stanumbers2
-            WHEN stakind3 IN (1, 4) THEN stanumbers3
-            WHEN stakind4 IN (1, 4) THEN stanumbers4
+            WHEN stakind1 = 1 THEN stanumbers1
+            WHEN stakind2 = 1 THEN stanumbers2
+            WHEN stakind3 = 1 THEN stanumbers3
+            WHEN stakind4 = 1 THEN stanumbers4
+            WHEN stakind5 = 1 THEN stanumbers5
         END AS most_common_freqs,
         CASE
             WHEN stakind1 = 2 THEN stavalues1
             WHEN stakind2 = 2 THEN stavalues2
             WHEN stakind3 = 2 THEN stavalues3
             WHEN stakind4 = 2 THEN stavalues4
+            WHEN stakind5 = 2 THEN stavalues5
         END AS histogram_bounds,
         CASE
             WHEN stakind1 = 3 THEN stanumbers1[1]
             WHEN stakind2 = 3 THEN stanumbers2[1]
             WHEN stakind3 = 3 THEN stanumbers3[1]
             WHEN stakind4 = 3 THEN stanumbers4[1]
-        END AS correlation
+            WHEN stakind5 = 3 THEN stanumbers5[1]
+        END AS correlation,
+        CASE
+            WHEN stakind1 = 4 THEN stavalues1
+            WHEN stakind2 = 4 THEN stavalues2
+            WHEN stakind3 = 4 THEN stavalues3
+            WHEN stakind4 = 4 THEN stavalues4
+            WHEN stakind5 = 4 THEN stavalues5
+        END AS most_common_elems,
+        CASE
+            WHEN stakind1 = 4 THEN stanumbers1
+            WHEN stakind2 = 4 THEN stanumbers2
+            WHEN stakind3 = 4 THEN stanumbers3
+            WHEN stakind4 = 4 THEN stanumbers4
+            WHEN stakind5 = 4 THEN stanumbers5
+        END AS most_common_elem_freqs,
+        CASE
+            WHEN stakind1 = 5 THEN stanumbers1
+            WHEN stakind2 = 5 THEN stanumbers2
+            WHEN stakind3 = 5 THEN stanumbers3
+            WHEN stakind4 = 5 THEN stanumbers4
+            WHEN stakind5 = 5 THEN stanumbers5
+        END AS elem_count_histogram
     FROM pg_statistic s JOIN pg_class c ON (c.oid = s.starelid)
          JOIN pg_attribute a ON (c.oid = attrelid AND attnum = s.staattnum)
          LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace)
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index b40e57b14f..9cd6e672ce 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -110,8 +110,6 @@ static void update_attstats(Oid relid, bool inh,
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 
-static bool std_typanalyze(VacAttrStats *stats);
-
 
 /*
  *	analyze_rel() -- analyze one relation
@@ -476,8 +474,7 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh)
 		for (i = 0; i < attr_cnt; i++)
 		{
 			VacAttrStats *stats = vacattrstats[i];
-			AttributeOpts *aopt =
-			get_attribute_options(onerel->rd_id, stats->attr->attnum);
+			AttributeOpts *aopt;
 
 			stats->rows = rows;
 			stats->tupDesc = onerel->rd_att;
@@ -490,11 +487,12 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt, bool inh)
 			 * If the appropriate flavor of the n_distinct option is
 			 * specified, override with the corresponding value.
 			 */
+			aopt = get_attribute_options(onerel->rd_id, stats->attr->attnum);
 			if (aopt != NULL)
 			{
-				float8		n_distinct =
-				inh ? aopt->n_distinct_inherited : aopt->n_distinct;
+				float8		n_distinct;
 
+				n_distinct = inh ? aopt->n_distinct_inherited : aopt->n_distinct;
 				if (n_distinct != 0.0)
 					stats->stadistinct = n_distinct;
 			}
@@ -1794,7 +1792,7 @@ static int	compare_mcvs(const void *a, const void *b);
 /*
  * std_typanalyze -- the default type-specific typanalyze function
  */
-static bool
+bool
 std_typanalyze(VacAttrStats *stats)
 {
 	Form_pg_attribute attr = stats->attr;
diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c
index 22c1132e9b..37fe5e8dae 100644
--- a/src/backend/commands/typecmds.c
+++ b/src/backend/commands/typecmds.c
@@ -609,7 +609,7 @@ DefineType(List *names, List *parameters)
 			   F_ARRAY_SEND,	/* send procedure */
 			   typmodinOid,		/* typmodin procedure */
 			   typmodoutOid,	/* typmodout procedure */
-			   InvalidOid,		/* analyze procedure - default */
+			   F_ARRAY_TYPANALYZE,	/* analyze procedure */
 			   typoid,			/* element type ID */
 			   true,			/* yes this is an array type */
 			   InvalidOid,		/* no further array type */
@@ -1140,7 +1140,7 @@ DefineEnum(CreateEnumStmt *stmt)
 			   F_ARRAY_SEND,	/* send procedure */
 			   InvalidOid,		/* typmodin procedure - none */
 			   InvalidOid,		/* typmodout procedure - none */
-			   InvalidOid,		/* analyze procedure - default */
+			   F_ARRAY_TYPANALYZE,	/* analyze procedure */
 			   enumTypeOid,		/* element type ID */
 			   true,			/* yes this is an array type */
 			   InvalidOid,		/* no further array type */
@@ -1450,7 +1450,7 @@ DefineRange(CreateRangeStmt *stmt)
 			   F_ARRAY_SEND,	/* send procedure */
 			   InvalidOid,		/* typmodin procedure - none */
 			   InvalidOid,		/* typmodout procedure - none */
-			   InvalidOid,		/* analyze procedure - default */
+			   F_ARRAY_TYPANALYZE,	/* analyze procedure */
 			   typoid,			/* element type ID */
 			   true,			/* yes this is an array type */
 			   InvalidOid,		/* no further array type */
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
index 0922777505..a07d410005 100644
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -220,6 +220,10 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
 	/*
 	 * There should be two more Numbers than Values, because the last two
 	 * cells are taken for minimal and maximal frequency.  Punt if not.
+	 *
+	 * (Note: the MCELEM statistics slot definition allows for a third extra
+	 * number containing the frequency of nulls, but we're not expecting that
+	 * to appear for a tsvector column.)
 	 */
 	if (nnumbers != nmcelem + 2)
 		return tsquery_opr_selec_no_stats(query);
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 15fae1c95f..9771415b2e 100644
--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
@@ -377,6 +377,11 @@ compute_tsvector_stats(VacAttrStats *stats,
 			 * able to find out the minimal and maximal frequency without
 			 * going through all the values.  We keep those two extra
 			 * frequencies in two extra cells in mcelem_freqs.
+			 *
+			 * (Note: the MCELEM statistics slot definition allows for a third
+			 * extra number containing the frequency of nulls, but we don't
+			 * create that for a tsvector column, since null elements aren't
+			 * possible.)
 			 */
 			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
 			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index c635c38f5b..c5b0a75e93 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -15,7 +15,8 @@ override CFLAGS+= -mieee
 endif
 endif
 
-OBJS = acl.o arrayfuncs.o array_userfuncs.o arrayutils.o bool.o \
+OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
+	array_userfuncs.o arrayutils.o bool.o \
 	cash.o char.o date.o datetime.o datum.o domains.o \
 	enum.o float.o format_type.o \
 	geo_ops.o geo_selfuncs.o int.o int8.o json.o like.o lockfuncs.o \
diff --git a/src/backend/utils/adt/array_selfuncs.c b/src/backend/utils/adt/array_selfuncs.c
new file mode 100644
index 0000000000..3916de4bfb
--- /dev/null
+++ b/src/backend/utils/adt/array_selfuncs.c
@@ -0,0 +1,1225 @@
+/*-------------------------------------------------------------------------
+ *
+ * array_selfuncs.c
+ *	  Functions for selectivity estimation of array operators
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/array_selfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <math.h>
+
+#include "catalog/pg_collation.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_statistic.h"
+#include "optimizer/clauses.h"
+#include "utils/array.h"
+#include "utils/lsyscache.h"
+#include "utils/selfuncs.h"
+#include "utils/typcache.h"
+
+
+/* Default selectivity constant for "@>" and "<@" operators */
+#define DEFAULT_CONTAIN_SEL 0.005
+
+/* Default selectivity constant for "&&" operator */
+#define DEFAULT_OVERLAP_SEL 0.01
+
+/* Default selectivity for given operator */
+#define DEFAULT_SEL(operator) \
+	((operator) == OID_ARRAY_OVERLAP_OP ? \
+		DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
+
+static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
+				  Oid elemtype, Oid operator);
+static Selectivity mcelem_array_selec(ArrayType *array,
+				   TypeCacheEntry *typentry,
+				   Datum *mcelem, int nmcelem,
+				   float4 *numbers, int nnumbers,
+				   float4 *hist, int nhist,
+				   Oid operator, FmgrInfo *cmpfunc);
+static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
+								   float4 *numbers, int nnumbers,
+								   Datum *array_data, int nitems,
+								   Oid operator, FmgrInfo *cmpfunc);
+static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
+							 float4 *numbers, int nnumbers,
+							 Datum *array_data, int nitems,
+							 float4 *hist, int nhist,
+							 Oid operator, FmgrInfo *cmpfunc);
+static float *calc_hist(const float4 *hist, int nhist, int n);
+static float *calc_distr(const float *p, int n, int m, float rest);
+static int	floor_log2(uint32 n);
+static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
+				 int *index, FmgrInfo *cmpfunc);
+static int	element_compare(const void *key1, const void *key2, void *arg);
+static int	float_compare_desc(const void *key1, const void *key2);
+
+
+/*
+ * scalararraysel_containment
+ *		Estimate selectivity of ScalarArrayOpExpr via array containment.
+ *
+ * scalararraysel() has already verified that the operator of a
+ * ScalarArrayOpExpr is the array element type's default equality or
+ * inequality operator.  If we have const =/<> ANY/ALL (array_var)
+ * then we can estimate the selectivity as though this were an array
+ * containment operator, array_var op ARRAY[const].
+ *
+ * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
+ */
+Selectivity
+scalararraysel_containment(PlannerInfo *root,
+						   Node *leftop, Node *rightop,
+						   Oid elemtype, bool isEquality, bool useOr,
+						   int varRelid)
+{
+	Selectivity selec;
+	VariableStatData vardata;
+	Datum		constval;
+	TypeCacheEntry *typentry;
+	FmgrInfo   *cmpfunc;
+
+	/*
+	 * rightop must be a variable, else punt.
+	 */
+	examine_variable(root, rightop, varRelid, &vardata);
+	if (!vardata.rel)
+	{
+		ReleaseVariableStats(vardata);
+		return -1.0;
+	}
+
+	/*
+	 * Aggressively reduce leftop to a constant, if possible.
+	 */
+	leftop = estimate_expression_value(root, leftop);
+	if (!IsA(leftop, Const))
+	{
+		ReleaseVariableStats(vardata);
+		return -1.0;
+	}
+	if (((Const *) leftop)->constisnull)
+	{
+		/* qual can't succeed if null on left */
+		ReleaseVariableStats(vardata);
+		return (Selectivity) 0.0;
+	}
+	constval = ((Const *) leftop)->constvalue;
+
+	/* Get element type's default comparison function */
+	typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
+	if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
+	{
+		ReleaseVariableStats(vardata);
+		return -1.0;
+	}
+	cmpfunc = &typentry->cmp_proc_finfo;
+
+	/*
+	 * If the operator is <>, swap ANY/ALL, then invert the result later.
+	 */
+	if (!isEquality)
+		useOr = !useOr;
+
+	/* Get array element stats for var, if available */
+	if (HeapTupleIsValid(vardata.statsTuple))
+	{
+		Form_pg_statistic stats;
+		Datum	   *values;
+		int			nvalues;
+		float4	   *numbers;
+		int			nnumbers;
+		float4	   *hist;
+		int			nhist;
+
+		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
+
+		/* MCELEM will be an array of same type as element */
+		if (get_attstatsslot(vardata.statsTuple,
+							 elemtype, vardata.atttypmod,
+							 STATISTIC_KIND_MCELEM, InvalidOid,
+							 NULL,
+							 &values, &nvalues,
+							 &numbers, &nnumbers))
+		{
+			/* For ALL case, also get histogram of distinct-element counts */
+			if (useOr ||
+				!get_attstatsslot(vardata.statsTuple,
+								  elemtype, vardata.atttypmod,
+								  STATISTIC_KIND_DECHIST, InvalidOid,
+								  NULL,
+								  NULL, NULL,
+								  &hist, &nhist))
+			{
+				hist = NULL;
+				nhist = 0;
+			}
+
+			/*
+			 * For = ANY, estimate as var @> ARRAY[const].
+			 *
+			 * For = ALL, estimate as var <@ ARRAY[const].
+			 */
+			if (useOr)
+				selec = mcelem_array_contain_overlap_selec(values, nvalues,
+														   numbers, nnumbers,
+														   &constval, 1,
+														   OID_ARRAY_CONTAINS_OP,
+														   cmpfunc);
+			else
+				selec = mcelem_array_contained_selec(values, nvalues,
+													 numbers, nnumbers,
+													 &constval, 1,
+													 hist, nhist,
+													 OID_ARRAY_CONTAINED_OP,
+													 cmpfunc);
+
+			if (hist)
+				free_attstatsslot(elemtype, NULL, 0, hist, nhist);
+			free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
+		}
+		else
+		{
+			/* No most-common-elements info, so do without */
+			if (useOr)
+				selec = mcelem_array_contain_overlap_selec(NULL, 0,
+														   NULL, 0,
+														   &constval, 1,
+														   OID_ARRAY_CONTAINS_OP,
+														   cmpfunc);
+			else
+				selec = mcelem_array_contained_selec(NULL, 0,
+													 NULL, 0,
+													 &constval, 1,
+													 NULL, 0,
+													 OID_ARRAY_CONTAINED_OP,
+													 cmpfunc);
+		}
+
+		/*
+		 * MCE stats count only non-null rows, so adjust for null rows.
+		 */
+		selec *= (1.0 - stats->stanullfrac);
+	}
+	else
+	{
+		/* No stats at all, so do without */
+		if (useOr)
+			selec = mcelem_array_contain_overlap_selec(NULL, 0,
+													   NULL, 0,
+													   &constval, 1,
+													   OID_ARRAY_CONTAINS_OP,
+													   cmpfunc);
+		else
+			selec = mcelem_array_contained_selec(NULL, 0,
+												 NULL, 0,
+												 &constval, 1,
+												 NULL, 0,
+												 OID_ARRAY_CONTAINED_OP,
+												 cmpfunc);
+		/* we assume no nulls here, so no stanullfrac correction */
+	}
+
+	ReleaseVariableStats(vardata);
+
+	/*
+	 * If the operator is <>, invert the results.
+	 */
+	if (!isEquality)
+		selec = 1.0 - selec;
+
+	CLAMP_PROBABILITY(selec);
+
+	return selec;
+}
+
+/*
+ * arraycontsel -- restriction selectivity for "arraycolumn @> const",
+ * "arraycolumn && const" or "arraycolumn <@ const"
+ */
+Datum
+arraycontsel(PG_FUNCTION_ARGS)
+{
+	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
+	Oid			operator = PG_GETARG_OID(1);
+	List	   *args = (List *) PG_GETARG_POINTER(2);
+	int			varRelid = PG_GETARG_INT32(3);
+	VariableStatData vardata;
+	Node	   *other;
+	bool		varonleft;
+	Selectivity selec;
+	Oid			element_typeid;
+
+	/*
+	 * If expression is not (variable op something) or (something op
+	 * variable), then punt and return a default estimate.
+	 */
+	if (!get_restriction_variable(root, args, varRelid,
+								  &vardata, &other, &varonleft))
+		PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
+
+	/*
+	 * Can't do anything useful if the something is not a constant, either.
+	 */
+	if (!IsA(other, Const))
+	{
+		ReleaseVariableStats(vardata);
+		PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
+	}
+
+	/*
+	 * The "&&", "@>" and "<@" operators are strict, so we can cope with a
+	 * NULL constant right away.
+	 */
+	if (((Const *) other)->constisnull)
+	{
+		ReleaseVariableStats(vardata);
+		PG_RETURN_FLOAT8(0.0);
+	}
+
+	/*
+	 * If var is on the right, commute the operator, so that we can assume
+	 * the var is on the left in what follows.
+	 */
+	if (!varonleft)
+	{
+		if (operator == OID_ARRAY_CONTAINS_OP)
+			operator = OID_ARRAY_CONTAINED_OP;
+		else if (operator == OID_ARRAY_CONTAINED_OP)
+			operator = OID_ARRAY_CONTAINS_OP;
+	}
+
+	/*
+	 * OK, there's a Var and a Const we're dealing with here.  We need the
+	 * Const to be a array with same element type as column, else we can't do
+	 * anything useful.  (Such cases will likely fail at runtime, but here
+	 * we'd rather just return a default estimate.)
+	 */
+	element_typeid = get_base_element_type(((Const *) other)->consttype);
+	if (element_typeid != InvalidOid &&
+		element_typeid == get_base_element_type(vardata.vartype))
+	{
+		selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
+								  element_typeid, operator);
+	}
+	else
+	{
+		selec = DEFAULT_SEL(operator);
+	}
+
+	ReleaseVariableStats(vardata);
+
+	CLAMP_PROBABILITY(selec);
+
+	PG_RETURN_FLOAT8((float8) selec);
+}
+
+/*
+ * arraycontjoinsel -- join selectivity for "arraycolumn @> const",
+ * "arraycolumn && const" or "arraycolumn <@ const"
+ */
+Datum
+arraycontjoinsel(PG_FUNCTION_ARGS)
+{
+	/* For the moment this is just a stub */
+	Oid			operator = PG_GETARG_OID(1);
+
+	PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
+}
+
+/*
+ * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
+ * or "arraycolumn <@ const" based on the statistics
+ *
+ * This function is mainly responsible for extracting the pg_statistic data
+ * to be used; we then pass the problem on to mcelem_array_selec().
+ */
+static Selectivity
+calc_arraycontsel(VariableStatData *vardata, Datum constval,
+				  Oid elemtype, Oid operator)
+{
+	Selectivity selec;
+	TypeCacheEntry *typentry;
+	FmgrInfo   *cmpfunc;
+	ArrayType  *array;
+
+	/* Get element type's default comparison function */
+	typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
+	if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
+		return DEFAULT_SEL(operator);
+	cmpfunc = &typentry->cmp_proc_finfo;
+
+	/*
+	 * The caller made sure the const is a array with same element type, so
+	 * get it now
+	 */
+	array = DatumGetArrayTypeP(constval);
+
+	if (HeapTupleIsValid(vardata->statsTuple))
+	{
+		Form_pg_statistic stats;
+		Datum	   *values;
+		int			nvalues;
+		float4	   *numbers;
+		int			nnumbers;
+		float4	   *hist;
+		int			nhist;
+
+		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+
+		/* MCELEM will be an array of same type as column */
+		if (get_attstatsslot(vardata->statsTuple,
+							 elemtype, vardata->atttypmod,
+							 STATISTIC_KIND_MCELEM, InvalidOid,
+							 NULL,
+							 &values, &nvalues,
+							 &numbers, &nnumbers))
+		{
+			/*
+			 * For "array <@ const" case we also need histogram of distinct
+			 * element counts.
+			 */
+			if (operator != OID_ARRAY_CONTAINED_OP ||
+				!get_attstatsslot(vardata->statsTuple,
+								  elemtype, vardata->atttypmod,
+								  STATISTIC_KIND_DECHIST, InvalidOid,
+								  NULL,
+								  NULL, NULL,
+								  &hist, &nhist))
+			{
+				hist = NULL;
+				nhist = 0;
+			}
+
+			/* Use the most-common-elements slot for the array Var. */
+			selec = mcelem_array_selec(array, typentry,
+									   values, nvalues,
+									   numbers, nnumbers,
+									   hist, nhist,
+									   operator, cmpfunc);
+
+			if (hist)
+				free_attstatsslot(elemtype, NULL, 0, hist, nhist);
+			free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
+		}
+		else
+		{
+			/* No most-common-elements info, so do without */
+			selec = mcelem_array_selec(array, typentry,
+									   NULL, 0, NULL, 0, NULL, 0,
+									   operator, cmpfunc);
+		}
+
+		/*
+		 * MCE stats count only non-null rows, so adjust for null rows.
+		 */
+		selec *= (1.0 - stats->stanullfrac);
+	}
+	else
+	{
+		/* No stats at all, so do without */
+		selec = mcelem_array_selec(array, typentry,
+								   NULL, 0, NULL, 0, NULL, 0,
+								   operator, cmpfunc);
+		/* we assume no nulls here, so no stanullfrac correction */
+	}
+
+	/* If constant was toasted, release the copy we made */
+	if (PointerGetDatum(array) != constval)
+		pfree(array);
+
+	return selec;
+}
+
+/*
+ * Array selectivity estimation based on most common elements statistics
+ *
+ * This function just deconstructs and sorts the array constant's contents,
+ * and then passes the problem on to mcelem_array_contain_overlap_selec or
+ * mcelem_array_contained_selec depending on the operator.
+ */
+static Selectivity
+mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
+				   Datum *mcelem, int nmcelem,
+				   float4 *numbers, int nnumbers,
+				   float4 *hist, int nhist,
+				   Oid operator, FmgrInfo *cmpfunc)
+{
+	Selectivity	selec;
+	int			num_elems;
+	Datum	   *elem_values;
+	bool	   *elem_nulls;
+	bool		null_present;
+	int			nonnull_nitems;
+	int			i;
+
+	/*
+	 * Prepare constant array data for sorting.  Sorting lets us find unique
+	 * elements and efficiently merge with the MCELEM array.
+	 */
+	deconstruct_array(array,
+					  typentry->type_id,
+					  typentry->typlen,
+					  typentry->typbyval,
+					  typentry->typalign,
+					  &elem_values, &elem_nulls, &num_elems);
+
+	/* Collapse out any null elements */
+	nonnull_nitems = 0;
+	null_present = false;
+	for (i = 0; i < num_elems; i++)
+	{
+		if (elem_nulls[i])
+			null_present = true;
+		else
+			elem_values[nonnull_nitems++] = elem_values[i];
+	}
+
+	/*
+	 * Query "column @> '{anything, null}'" matches nothing.  For the other
+	 * two operators, presence of a null in the constant can be ignored.
+	 */
+	if (null_present && operator == OID_ARRAY_CONTAINS_OP)
+	{
+		pfree(elem_values);
+		pfree(elem_nulls);
+		return (Selectivity) 0.0;
+	}
+
+	/* Sort extracted elements using their default comparison function. */
+	qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
+			  element_compare, cmpfunc);
+
+	/* Separate cases according to operator */
+	if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
+		selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
+												   numbers, nnumbers,
+												   elem_values, nonnull_nitems,
+												   operator, cmpfunc);
+	else if (operator == OID_ARRAY_CONTAINED_OP)
+		selec = mcelem_array_contained_selec(mcelem, nmcelem,
+											 numbers, nnumbers,
+											 elem_values, nonnull_nitems,
+											 hist, nhist,
+											 operator, cmpfunc);
+	else
+	{
+		elog(ERROR, "arraycontsel called for unrecognized operator %u",
+			 operator);
+		selec = 0.0;			/* keep compiler quiet */
+	}
+
+	pfree(elem_values);
+	pfree(elem_nulls);
+	return selec;
+}
+
+/*
+ * Estimate selectivity of "column @> const" and "column && const" based on
+ * most common element statistics.	This estimation assumes element
+ * occurrences are independent.
+ *
+ * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
+ * the array column's MCELEM statistics slot, or are NULL/0 if stats are
+ * not available.  array_data (of length nitems) is the constant's elements.
+ *
+ * Both the mcelem and array_data arrays are assumed presorted according
+ * to the element type's cmpfunc.  Null elements are not present.
+ *
+ * TODO: this estimate probably could be improved by using the distinct
+ * elements count histogram.  For example, excepting the special case of
+ * "column @> '{}'", we can multiply the calculated selectivity by the
+ * fraction of nonempty arrays in the column.
+ */
+static Selectivity
+mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
+								   float4 *numbers, int nnumbers,
+								   Datum *array_data, int nitems,
+								   Oid operator, FmgrInfo *cmpfunc)
+{
+	Selectivity selec,
+				elem_selec;
+	int			mcelem_index,
+				i;
+	bool		use_bsearch;
+	float4		minfreq;
+
+	/*
+	 * There should be three more Numbers than Values, because the last three
+	 * cells should hold minimal and maximal frequency among the non-null
+	 * elements, and then the frequency of null elements.  Ignore the Numbers
+	 * if not right.
+	 */
+	if (nnumbers != nmcelem + 3)
+	{
+		numbers = NULL;
+		nnumbers = 0;
+	}
+
+	if (numbers)
+	{
+		/* Grab the lowest observed frequency */
+		minfreq = numbers[nmcelem];
+	}
+	else
+	{
+		/* Without statistics make some default assumptions */
+		minfreq = 2 * DEFAULT_CONTAIN_SEL;
+	}
+
+	/* Decide whether it is faster to use binary search or not. */
+	if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
+		use_bsearch = true;
+	else
+		use_bsearch = false;
+
+	if (operator == OID_ARRAY_CONTAINS_OP)
+	{
+		/*
+		 * Initial selectivity for "column @> const" query is 1.0, and it will
+		 * be decreased with each element of constant array.
+		 */
+		selec = 1.0;
+	}
+	else
+	{
+		/*
+		 * Initial selectivity for "column && const" query is 0.0, and it will
+		 * be increased with each element of constant array.
+		 */
+		selec = 0.0;
+	}
+
+	/* Scan mcelem and array in parallel. */
+	mcelem_index = 0;
+	for (i = 0; i < nitems; i++)
+	{
+		bool		match = false;
+
+		/* Ignore any duplicates in the array data. */
+		if (i > 0 &&
+			element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
+			continue;
+
+		/* Find the smallest MCELEM >= this array item. */
+		if (use_bsearch)
+		{
+			match = find_next_mcelem(mcelem, nmcelem, array_data[i],
+									 &mcelem_index, cmpfunc);
+		}
+		else
+		{
+			while (mcelem_index < nmcelem)
+			{
+				int			cmp = element_compare(&mcelem[mcelem_index],
+												  &array_data[i],
+												  cmpfunc);
+
+				if (cmp < 0)
+					mcelem_index++;
+				else
+				{
+					if (cmp == 0)
+						match = true; /* mcelem is found */
+					break;
+				}
+			}
+		}
+
+		if (match && numbers)
+		{
+			/* MCELEM matches the array item; use its frequency. */
+			elem_selec = numbers[mcelem_index];
+			mcelem_index++;
+		}
+		else
+		{
+			/*
+			 * The element is not in MCELEM.  Punt, but assume that the
+			 * selectivity cannot be more than minfreq / 2.
+			 */
+			elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
+		}
+
+		/*
+		 * Update overall selectivity using the current element's selectivity
+		 * and an assumption of element occurrence independence.
+		 */
+		if (operator == OID_ARRAY_CONTAINS_OP)
+			selec *= elem_selec;
+		else
+			selec = selec + elem_selec - selec * elem_selec;
+
+		/* Clamp intermediate results to stay sane despite roundoff error */
+		CLAMP_PROBABILITY(selec);
+	}
+
+	return selec;
+}
+
+/*
+ * Estimate selectivity of "column <@ const" based on most common element
+ * statistics.
+ *
+ * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
+ * the array column's MCELEM statistics slot, or are NULL/0 if stats are
+ * not available.  array_data (of length nitems) is the constant's elements.
+ * hist (of length nhist) is from the array column's DECHIST statistics slot,
+ * or is NULL/0 if those stats are not available.
+ *
+ * Both the mcelem and array_data arrays are assumed presorted according
+ * to the element type's cmpfunc.  Null elements are not present.
+ *
+ * Independent element occurrence would imply a particular distribution of
+ * distinct element counts among matching rows.  Real data usually falsifies
+ * that assumption.  For example, in a set of 11-element integer arrays having
+ * elements in the range [0..10], element occurrences are typically not
+ * independent.  If they were, a sufficiently-large set would include all
+ * distinct element counts 0 through 11.  We correct for this using the
+ * histogram of distinct element counts.
+ *
+ * In the "column @> const" and "column && const" cases, we usually have a
+ * "const" with low number of elements (otherwise we have selectivity close
+ * to 0 or 1 respectively).  That's why the effect of dependence related
+ * to distinct element count distribution is negligible there.  In the
+ * "column <@ const" case, number of elements is usually high (otherwise we
+ * have selectivity close to 0).  That's why we should do a correction with
+ * the array distinct element count distribution here.
+ *
+ * Using the histogram of distinct element counts produces a different
+ * distribution law than independent occurrences of elements.  This
+ * distribution law can be described as follows:
+ *
+ * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
+ *	  (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
+ *
+ * where:
+ * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
+ *		(1 - occurrence, 0 - no occurrence) in row
+ * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
+ *		(scalar values in [0..1]) according to collected statistics
+ * m = o1 + o2 + ... + on = total number of distinct elements in row
+ * hist[m] - histogram data for occurrence of m elements.
+ * ind[m] - probability of m occurrences from n events assuming their
+ *	  probabilities to be equal to frequencies of array elements.
+ *
+ * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
+ * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
+ */
+static Selectivity
+mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
+							 float4 *numbers, int nnumbers,
+							 Datum *array_data, int nitems,
+							 float4 *hist, int nhist,
+							 Oid operator, FmgrInfo *cmpfunc)
+{
+	int			mcelem_index,
+				i,
+				unique_nitems = 0;
+	float		selec,
+				minfreq,
+				nullelem_freq;
+	float	   *dist,
+			   *mcelem_dist,
+			   *hist_part;
+	float		avg_count,
+				mult,
+				rest;
+	float	   *elem_selec;
+
+	/*
+	 * There should be three more Numbers than Values in the MCELEM slot,
+	 * because the last three cells should hold minimal and maximal frequency
+	 * among the non-null elements, and then the frequency of null elements.
+	 * Punt if not right, because we can't do much without the element freqs.
+	 */
+	if (numbers == NULL || nnumbers != nmcelem + 3)
+		return DEFAULT_CONTAIN_SEL;
+
+	/*
+	 * Grab some of the summary statistics that compute_array_stats() stores:
+	 * lowest frequency, frequency of null elements, and average distinct
+	 * element count.
+	 */
+	minfreq = numbers[nmcelem];
+	nullelem_freq = numbers[nmcelem + 2];
+
+	if (hist && nhist > 0)
+		avg_count = hist[nhist - 1];
+	else
+		avg_count = 10.0f;		/* default assumption */
+
+	/*
+	 * "rest" will be the sum of the frequencies of all elements not
+	 * represented in MCELEM.  The average distinct element count is the sum
+	 * of the frequencies of *all* elements.  Begin with that; we will proceed
+	 * to subtract the MCELEM frequencies.
+	 */
+	rest = avg_count;
+
+	/*
+	 * mult is a multiplier representing estimate of probability that each
+	 * mcelem that is not present in constant doesn't occur.
+	 */
+	mult = 1.0f;
+
+	/*
+	 * elem_selec is array of estimated frequencies for elements in the
+	 * constant.
+	 */
+	elem_selec = (float *) palloc(sizeof(float) * nitems);
+
+	/* Scan mcelem and array in parallel. */
+	mcelem_index = 0;
+	for (i = 0; i < nitems; i++)
+	{
+		bool		match = false;
+
+		/* Ignore any duplicates in the array data. */
+		if (i > 0 &&
+			element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
+			continue;
+
+		/*
+		 * Iterate over MCELEM until we find an entry greater than or equal to
+		 * this element of the constant.  Update "rest" and "mult" for mcelem
+		 * entries skipped over.
+		 */
+		while (mcelem_index < nmcelem)
+		{
+			int			cmp = element_compare(&mcelem[mcelem_index],
+											  &array_data[i],
+											  cmpfunc);
+
+			if (cmp < 0)
+			{
+				mult *= (1.0f - numbers[mcelem_index]);
+				rest -= numbers[mcelem_index];
+				mcelem_index++;
+			}
+			else
+			{
+				if (cmp == 0)
+					match = true; /* mcelem is found */
+				break;
+			}
+		}
+
+		if (match)
+		{
+			/* MCELEM matches the array item. */
+			elem_selec[unique_nitems] = numbers[mcelem_index];
+			/* "rest" is decremented for all mcelems, matched or not */
+			rest -= numbers[mcelem_index];
+			mcelem_index++;
+		}
+		else
+		{
+			/*
+			 * The element is not in MCELEM.  Punt, but assume that the
+			 * selectivity cannot be more than minfreq / 2.
+			 */
+			elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
+											minfreq / 2);
+		}
+
+		unique_nitems++;
+	}
+
+	/*
+	 * If we handled all constant elements without exhausting the MCELEM
+	 * array, finish walking it to complete calculation of "rest" and "mult".
+	 */
+	while (mcelem_index < nmcelem)
+	{
+		mult *= (1.0f - numbers[mcelem_index]);
+		rest -= numbers[mcelem_index];
+		mcelem_index++;
+	}
+
+	/*
+	 * The presence of many distinct rare elements materially decreases
+	 * selectivity.  Use the Poisson distribution to estimate the probability
+	 * of a column value having zero occurrences of such elements.	See above
+	 * for the definition of "rest".
+	 */
+	mult *= exp(-rest);
+
+	/* Check we have nonempty distinct element count histogram */
+	if (hist && nhist >= 3)
+	{
+		/*----------
+		 * Using the distinct element count histogram requires
+		 *		O(unique_nitems * (nmcelem + unique_nitems))
+		 * operations.  Beyond a certain computational cost threshold, it's
+		 * reasonable to sacrifice accuracy for decreased planning time.
+		 * We limit the number of operations to EFFORT * nmcelem; since
+		 * nmcelem is limited by the column's statistics target, the work
+		 * done is user-controllable.
+		 *
+		 * If the number of operations would be too large, we can reduce it
+		 * without losing all accuracy by reducing unique_nitems and
+		 * considering only the most-common elements of the constant array.
+		 * To make the results exactly match what we would have gotten with
+		 * only those elements to start with, we'd have to remove any
+		 * discarded elements' frequencies from "mult", but since this is only
+		 * an approximation anyway, we don't bother with that.  Therefore it's
+		 * sufficient to qsort elem_selec[] and take the largest elements.
+		 * (They will no longer match up with the elements of array_data[],
+		 * but we don't care.)
+		 *----------
+		 */
+#define EFFORT 100
+
+		if ((nmcelem + unique_nitems) > 0 &&
+			unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
+		{
+			/*
+			 * Use the quadratic formula to solve for largest allowable N;
+			 * we have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
+			 */
+			double	b = (double) nmcelem;
+			int		n;
+
+			n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
+
+			/* Sort, then take just the first n elements */
+			qsort(elem_selec, unique_nitems, sizeof(float),
+				  float_compare_desc);
+			unique_nitems = n;
+		}
+
+		/*
+		 * Calculate probabilities of each distinct element count for both
+		 * mcelems and constant elements.  At this point, assume independent
+		 * element occurrence.
+		 */
+		dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
+		mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
+
+		/* ignore hist[nhist-1], which is the avg not a histogram member */
+		hist_part = calc_hist(hist, nhist - 1, unique_nitems);
+
+		selec = 0.0f;
+		for (i = 0; i <= unique_nitems; i++)
+		{
+			/*
+			 * mult * dist[i] / mcelem_dist[i] gives us probability of qual
+			 * matching from assumption of independent element occurrence with
+			 * the condition that distinct element count = i.
+			 */
+			if (mcelem_dist[i] > 0)
+				selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
+		}
+
+		pfree(dist);
+		pfree(mcelem_dist);
+		pfree(hist_part);
+	}
+	else
+	{
+		/* We don't have histogram.  Use a rough estimate. */
+		selec = mult;
+	}
+
+	pfree(elem_selec);
+
+	/* Take into account occurrence of NULL element. */
+	selec *= (1.0f - nullelem_freq);
+
+	CLAMP_PROBABILITY(selec);
+
+	return selec;
+}
+
+/*
+ * Calculate the first n distinct element count probabilities from a
+ * histogram of distinct element counts.
+ *
+ * Returns a palloc'd array of n+1 entries, with array[k] being the
+ * probability of element count k, k in [0..n].
+ *
+ * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
+ * (nhist - 1)) probability to each value in (a,b) and an additional half of
+ * that to a and b themselves.
+ */
+static float *
+calc_hist(const float4 *hist, int nhist, int n)
+{
+	float	   *hist_part;
+	int			k,
+				i = 0;
+	float		prev_interval = 0,
+				next_interval;
+	float		frac;
+
+	hist_part = (float *) palloc((n + 1) * sizeof(float));
+
+	/*
+	 * frac is a probability contribution for each interval between histogram
+	 * values.  We have nhist - 1 intervals, so contribution of each one will
+	 * be 1 / (nhist - 1).
+	 */
+	frac = 1.0f / ((float) (nhist - 1));
+
+	for (k = 0; k <= n; k++)
+	{
+		int			count = 0;
+
+		/*
+		 * Count the histogram boundaries equal to k.  (Although the histogram
+		 * should theoretically contain only exact integers, entries are
+		 * floats so there could be roundoff error in large values.  Treat any
+		 * fractional value as equal to the next larger k.)
+		 */
+		while (i < nhist && hist[i] <= k)
+		{
+			count++;
+			i++;
+		}
+
+		if (count > 0)
+		{
+			/* k is an exact bound for at least one histogram box. */
+			float		val;
+
+			/* Find length between current histogram value and the next one */
+			if (i < nhist)
+				next_interval = hist[i] - hist[i - 1];
+			else
+				next_interval = 0;
+
+			/*
+			 * count - 1 histogram boxes contain k exclusively.  They
+			 * contribute a total of (count - 1) * frac probability.  Also
+			 * factor in the partial histogram boxes on either side.
+			 */
+			val = (float) (count - 1);
+			if (next_interval > 0)
+				val += 0.5f / next_interval;
+			if (prev_interval > 0)
+				val += 0.5f / prev_interval;
+			hist_part[k] = frac * val;
+
+			prev_interval = next_interval;
+		}
+		else
+		{
+			/* k does not appear as an exact histogram bound. */
+			if (prev_interval > 0)
+				hist_part[k] = frac / prev_interval;
+			else
+				hist_part[k] = 0.0f;
+		}
+	}
+
+	return hist_part;
+}
+
+/*
+ * Consider n independent events with probabilities p[].  This function
+ * calculates probabilities of exact k of events occurrence for k in [0..m].
+ * Returns a palloc'd array of size m+1.
+ *
+ * "rest" is the sum of the probabilities of all low-probability events not
+ * included in p.
+ *
+ * Imagine matrix M of size (n + 1) x (m + 1).  Element M[i,j] denotes the
+ * probability that exactly j of first i events occur.	Obviously M[0,0] = 1.
+ * For any constant j, each increment of i increases the probability iff the
+ * event occurs.  So, by the law of total probability:
+ *	M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
+ *		for i > 0, j > 0.
+ *	M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
+ */
+static float *
+calc_distr(const float *p, int n, int m, float rest)
+{
+	float	   *row,
+			   *prev_row,
+			   *tmp;
+	int			i,
+				j;
+
+	/*
+	 * Since we return only the last row of the matrix and need only the
+	 * current and previous row for calculations, allocate two rows.
+	 */
+	row = (float *) palloc((m + 1) * sizeof(float));
+	prev_row = (float *) palloc((m + 1) * sizeof(float));
+
+	/* M[0,0] = 1 */
+	row[0] = 1.0f;
+	for (i = 1; i <= n; i++)
+	{
+		float		t = p[i - 1];
+
+		/* Swap rows */
+		tmp = row;
+		row = prev_row;
+		prev_row = tmp;
+
+		/* Calculate next row */
+		for (j = 0; j <= i && j <= m; j++)
+		{
+			float		val = 0.0f;
+
+			if (j < i)
+				val += prev_row[j] * (1.0f - t);
+			if (j > 0)
+				val += prev_row[j - 1] * t;
+			row[j] = val;
+		}
+	}
+
+	/*
+	 * The presence of many distinct rare (not in "p") elements materially
+	 * decreases selectivity.  Model their collective occurrence with the
+	 * Poisson distribution.
+	 */
+	if (rest > DEFAULT_CONTAIN_SEL)
+	{
+		float		t;
+
+		/* Swap rows */
+		tmp = row;
+		row = prev_row;
+		prev_row = tmp;
+
+		for (i = 0; i <= m; i++)
+			row[i] = 0.0f;
+
+		/* Value of Poisson distribution for 0 occurrences */
+		t = exp(-rest);
+
+		/*
+		 * Calculate convolution of previously computed distribution and the
+		 * Poisson distribution.
+		 */
+		for (i = 0; i <= m; i++)
+		{
+			for (j = 0; j <= m - i; j++)
+				row[j + i] += prev_row[j] * t;
+
+			/* Get Poisson distribution value for (i + 1) occurrences */
+			t *= rest / (float) (i + 1);
+		}
+	}
+
+	pfree(prev_row);
+	return row;
+}
+
+/* Fast function for floor value of 2 based logarithm calculation. */
+static int
+floor_log2(uint32 n)
+{
+	int			logval = 0;
+
+	if (n == 0)
+		return -1;
+	if (n >= (1 << 16))
+	{
+		n >>= 16;
+		logval += 16;
+	}
+	if (n >= (1 << 8))
+	{
+		n >>= 8;
+		logval += 8;
+	}
+	if (n >= (1 << 4))
+	{
+		n >>= 4;
+		logval += 4;
+	}
+	if (n >= (1 << 2))
+	{
+		n >>= 2;
+		logval += 2;
+	}
+	if (n >= (1 << 1))
+	{
+		logval += 1;
+	}
+	return logval;
+}
+
+/*
+ * find_next_mcelem binary-searches a most common elements array, starting
+ * from *index, for the first member >= value.	It saves the position of the
+ * match into *index and returns true if it's an exact match.  (Note: we
+ * assume the mcelem elements are distinct so there can't be more than one
+ * exact match.)
+ */
+static bool
+find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
+				 FmgrInfo *cmpfunc)
+{
+	int			l = *index,
+				r = nmcelem - 1,
+				i,
+				res;
+
+	while (l <= r)
+	{
+		i = (l + r) / 2;
+		res = element_compare(&mcelem[i], &value, cmpfunc);
+		if (res == 0)
+		{
+			*index = i;
+			return true;
+		}
+		else if (res < 0)
+			l = i + 1;
+		else
+			r = i - 1;
+	}
+	*index = l;
+	return false;
+}
+
+/*
+ * Comparison function for elements.
+ *
+ * We use the element type's default btree opclass, and the default collation
+ * if the type is collation-sensitive.
+ *
+ * XXX consider using SortSupport infrastructure
+ */
+static int
+element_compare(const void *key1, const void *key2, void *arg)
+{
+	Datum		d1 = *((const Datum *) key1);
+	Datum		d2 = *((const Datum *) key2);
+	FmgrInfo   *cmpfunc = (FmgrInfo *) arg;
+	Datum		c;
+
+	c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2);
+	return DatumGetInt32(c);
+}
+
+/*
+ * Comparison function for sorting floats into descending order.
+ */
+static int
+float_compare_desc(const void *key1, const void *key2)
+{
+	float		d1 = *((const float *) key1);
+	float		d2 = *((const float *) key2);
+
+	if (d1 > d2)
+		return -1;
+	else if (d1 < d2)
+		return 1;
+	else
+		return 0;
+}
diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c
new file mode 100644
index 0000000000..941e2adb03
--- /dev/null
+++ b/src/backend/utils/adt/array_typanalyze.c
@@ -0,0 +1,762 @@
+/*-------------------------------------------------------------------------
+ *
+ * array_typanalyze.c
+ *	  Functions for gathering statistics from array columns
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/adt/array_typanalyze.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/tuptoaster.h"
+#include "catalog/pg_collation.h"
+#include "commands/vacuum.h"
+#include "utils/array.h"
+#include "utils/datum.h"
+#include "utils/typcache.h"
+
+
+/*
+ * To avoid consuming too much memory, IO and CPU load during analysis, and/or
+ * too much space in the resulting pg_statistic rows, we ignore arrays that
+ * are wider than ARRAY_WIDTH_THRESHOLD (after detoasting!).  Note that this
+ * number is considerably more than the similar WIDTH_THRESHOLD limit used
+ * in analyze.c's standard typanalyze code.
+ */
+#define ARRAY_WIDTH_THRESHOLD 0x10000
+
+/* Extra data for compute_array_stats function */
+typedef struct
+{
+	/* Information about array element type */
+	Oid			type_id;		/* element type's OID */
+	Oid			eq_opr;			/* default equality operator's OID */
+	bool		typbyval;		/* physical properties of element type */
+	int16		typlen;
+	char		typalign;
+
+	/*
+	 * Lookup data for element type's comparison and hash functions (these
+	 * are in the type's typcache entry, which we expect to remain valid
+	 * over the lifespan of the ANALYZE run)
+	 */
+	FmgrInfo   *cmp;
+	FmgrInfo   *hash;
+
+	/* Saved state from std_typanalyze() */
+	AnalyzeAttrComputeStatsFunc std_compute_stats;
+	void	   *std_extra_data;
+} ArrayAnalyzeExtraData;
+
+/*
+ * While compute_array_stats is running, we keep a pointer to the extra data
+ * here for use by assorted subroutines.  compute_array_stats doesn't
+ * currently need to be re-entrant, so avoiding this is not worth the extra
+ * notational cruft that would be needed.
+ */
+static ArrayAnalyzeExtraData *array_extra_data;
+
+/* A hash table entry for the Lossy Counting algorithm */
+typedef struct
+{
+	Datum		key;			/* This is 'e' from the LC algorithm. */
+	int			frequency;		/* This is 'f'. */
+	int			delta;			/* And this is 'delta'. */
+	int			last_container; /* For de-duplication of array elements. */
+} TrackItem;
+
+/* A hash table entry for distinct-elements counts */
+typedef struct
+{
+	int			count;			/* Count of distinct elements in an array */
+	int			frequency;		/* Number of arrays seen with this count */
+} DECountItem;
+
+static void compute_array_stats(VacAttrStats *stats,
+		   AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
+static void prune_element_hashtable(HTAB *elements_tab, int b_current);
+static uint32 element_hash(const void *key, Size keysize);
+static int	element_match(const void *key1, const void *key2, Size keysize);
+static int	element_compare(const void *key1, const void *key2);
+static int	trackitem_compare_frequencies_desc(const void *e1, const void *e2);
+static int	trackitem_compare_element(const void *e1, const void *e2);
+static int	countitem_compare_count(const void *e1, const void *e2);
+
+
+/*
+ * array_typanalyze -- typanalyze function for array columns
+ */
+Datum
+array_typanalyze(PG_FUNCTION_ARGS)
+{
+	VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
+	Oid			element_typeid;
+	TypeCacheEntry *typentry;
+	ArrayAnalyzeExtraData *extra_data;
+
+	/*
+	 * Call the standard typanalyze function.  It may fail to find needed
+	 * operators, in which case we also can't do anything, so just fail.
+	 */
+	if (!std_typanalyze(stats))
+		PG_RETURN_BOOL(false);
+
+	/*
+	 * Check attribute data type is a varlena array.
+	 */
+	element_typeid = stats->attrtype->typelem;
+
+	if (!OidIsValid(element_typeid) || stats->attrtype->typlen != -1)
+		elog(ERROR, "array_typanalyze was invoked for non-array type %u",
+			 stats->attrtypid);
+
+	/*
+	 * Gather information about the element type.  If we fail to find
+	 * something, return leaving the state from std_typanalyze() in place.
+	 */
+	typentry = lookup_type_cache(element_typeid,
+								 TYPECACHE_EQ_OPR |
+								 TYPECACHE_CMP_PROC_FINFO |
+								 TYPECACHE_HASH_PROC_FINFO);
+
+	if (!OidIsValid(typentry->eq_opr) ||
+		!OidIsValid(typentry->cmp_proc_finfo.fn_oid) ||
+		!OidIsValid(typentry->hash_proc_finfo.fn_oid))
+		PG_RETURN_BOOL(true);
+
+	/* Store our findings for use by compute_array_stats() */
+	extra_data = (ArrayAnalyzeExtraData *) palloc(sizeof(ArrayAnalyzeExtraData));
+	extra_data->type_id = typentry->type_id;
+	extra_data->eq_opr = typentry->eq_opr;
+	extra_data->typbyval = typentry->typbyval;
+	extra_data->typlen = typentry->typlen;
+	extra_data->typalign = typentry->typalign;
+	extra_data->cmp = &typentry->cmp_proc_finfo;
+	extra_data->hash = &typentry->hash_proc_finfo;
+
+	/* Save old compute_stats and extra_data for scalar statistics ... */
+	extra_data->std_compute_stats = stats->compute_stats;
+	extra_data->std_extra_data = stats->extra_data;
+
+	/* ... and replace with our info */
+	stats->compute_stats = compute_array_stats;
+	stats->extra_data = extra_data;
+
+	/*
+	 * Note we leave stats->minrows set as std_typanalyze set it.  Should
+	 * it be increased for array analysis purposes?
+	 */
+
+	PG_RETURN_BOOL(true);
+}
+
+/*
+ * compute_array_stats() -- compute statistics for a array column
+ *
+ * This function computes statistics useful for determining selectivity of
+ * the array operators <@, &&, and @>.  It is invoked by ANALYZE via the
+ * compute_stats hook after sample rows have been collected.
+ *
+ * We also invoke the standard compute_stats function, which will compute
+ * "scalar" statistics relevant to the btree-style array comparison operators.
+ * However, exact duplicates of an entire array may be rare despite many
+ * arrays sharing individual elements.  This especially afflicts long arrays,
+ * which are also liable to lack all scalar statistics due to the low
+ * WIDTH_THRESHOLD used in analyze.c.  So, in addition to the standard stats,
+ * we find the most common array elements and compute a histogram of distinct
+ * element counts.
+ *
+ * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
+ * frequency counts over data streams" by G. S. Manku and R. Motwani, in
+ * Proceedings of the 28th International Conference on Very Large Data Bases,
+ * Hong Kong, China, August 2002, section 4.2. The paper is available at
+ * http://www.vldb.org/conf/2002/S10P03.pdf
+ *
+ * The Lossy Counting (aka LC) algorithm goes like this:
+ * Let s be the threshold frequency for an item (the minimum frequency we
+ * are interested in) and epsilon the error margin for the frequency. Let D
+ * be a set of triples (e, f, delta), where e is an element value, f is that
+ * element's frequency (actually, its current occurrence count) and delta is
+ * the maximum error in f. We start with D empty and process the elements in
+ * batches of size w. (The batch size is also known as "bucket size" and is
+ * equal to 1/epsilon.) Let the current batch number be b_current, starting
+ * with 1. For each element e we either increment its f count, if it's
+ * already in D, or insert a new triple into D with values (e, 1, b_current
+ * - 1). After processing each batch we prune D, by removing from it all
+ * elements with f + delta <= b_current.  After the algorithm finishes we
+ * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
+ * where N is the total number of elements in the input.  We emit the
+ * remaining elements with estimated frequency f/N.  The LC paper proves
+ * that this algorithm finds all elements with true frequency at least s,
+ * and that no frequency is overestimated or is underestimated by more than
+ * epsilon.  Furthermore, given reasonable assumptions about the input
+ * distribution, the required table size is no more than about 7 times w.
+ *
+ * In the absence of a principled basis for other particular values, we
+ * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10.
+ * But we leave out the correction for stopwords, which do not apply to
+ * arrays.  These parameters give bucket width w = K/0.007 and maximum
+ * expected hashtable size of about 1000 * K.
+ *
+ * Elements may repeat within an array.  Since duplicates do not change the
+ * behavior of <@, && or @>, we want to count each element only once per
+ * array.  Therefore, we store in the finished pg_statistic entry each
+ * element's frequency as the fraction of all non-null rows that contain it.
+ * We divide the raw counts by nonnull_cnt to get those figures.
+ */
+static void
+compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
+					int samplerows, double totalrows)
+{
+	ArrayAnalyzeExtraData *extra_data;
+	int			num_mcelem;
+	int			null_cnt = 0;
+	int			null_elem_cnt = 0;
+	int			analyzed_rows = 0;
+
+	/* This is D from the LC algorithm. */
+	HTAB	   *elements_tab;
+	HASHCTL		elem_hash_ctl;
+	HASH_SEQ_STATUS scan_status;
+
+	/* This is the current bucket number from the LC algorithm */
+	int			b_current;
+
+	/* This is 'w' from the LC algorithm */
+	int			bucket_width;
+	int			array_no;
+	int64		element_no;
+	TrackItem  *item;
+	int			slot_idx;
+	HTAB	   *count_tab;
+	HASHCTL		count_hash_ctl;
+	DECountItem *count_item;
+
+	extra_data = (ArrayAnalyzeExtraData *) stats->extra_data;
+
+	/*
+	 * Invoke analyze.c's standard analysis function to create scalar-style
+	 * stats for the column.  It will expect its own extra_data pointer,
+	 * so temporarily install that.
+	 */
+	stats->extra_data = extra_data->std_extra_data;
+	(*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows);
+	stats->extra_data = extra_data;
+
+	/*
+	 * Set up static pointer for use by subroutines.  We wait till here in
+	 * case std_compute_stats somehow recursively invokes us (probably not
+	 * possible, but ...)
+	 */
+	array_extra_data = extra_data;
+
+	/*
+	 * We want statistics_target * 10 elements in the MCELEM array. This
+	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
+	 * the number of individual elements tracked in pg_statistic ought to be
+	 * more than the number of values for a simple scalar column.
+	 */
+	num_mcelem = stats->attr->attstattarget * 10;
+
+	/*
+	 * We set bucket width equal to num_mcelem / 0.007 as per the comment
+	 * above.
+	 */
+	bucket_width = num_mcelem * 1000 / 7;
+
+	/*
+	 * Create the hashtable. It will be in local memory, so we don't need to
+	 * worry about overflowing the initial size. Also we don't need to pay any
+	 * attention to locking and memory management.
+	 */
+	MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl));
+	elem_hash_ctl.keysize = sizeof(Datum);
+	elem_hash_ctl.entrysize = sizeof(TrackItem);
+	elem_hash_ctl.hash = element_hash;
+	elem_hash_ctl.match = element_match;
+	elem_hash_ctl.hcxt = CurrentMemoryContext;
+	elements_tab = hash_create("Analyzed elements table",
+							   bucket_width * 7,
+							   &elem_hash_ctl,
+					HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
+
+	/* hashtable for array distinct elements counts */
+	MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl));
+	count_hash_ctl.keysize = sizeof(int);
+	count_hash_ctl.entrysize = sizeof(DECountItem);
+	count_hash_ctl.hash = tag_hash;
+	count_hash_ctl.hcxt = CurrentMemoryContext;
+	count_tab = hash_create("Array distinct element count table",
+							64,
+							&count_hash_ctl,
+							HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+
+	/* Initialize counters. */
+	b_current = 1;
+	element_no = 0;
+
+	/* Loop over the arrays. */
+	for (array_no = 0; array_no < samplerows; array_no++)
+	{
+		Datum		value;
+		bool		isnull;
+		ArrayType  *array;
+		int			num_elems;
+		Datum	   *elem_values;
+		bool	   *elem_nulls;
+		bool		null_present;
+		int			j;
+		int64		prev_element_no = element_no;
+		int			distinct_count;
+		bool		count_item_found;
+
+		vacuum_delay_point();
+
+		value = fetchfunc(stats, array_no, &isnull);
+		if (isnull)
+		{
+			/* array is null, just count that */
+			null_cnt++;
+			continue;
+		}
+
+		/* Skip too-large values. */
+		if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD)
+			continue;
+		else
+			analyzed_rows++;
+
+		/*
+		 * Now detoast the array if needed, and deconstruct into datums.
+		 */
+		array = DatumGetArrayTypeP(value);
+
+		Assert(ARR_ELEMTYPE(array) == extra_data->type_id);
+		deconstruct_array(array,
+						  extra_data->type_id,
+						  extra_data->typlen,
+						  extra_data->typbyval,
+						  extra_data->typalign,
+						  &elem_values, &elem_nulls, &num_elems);
+
+		/*
+		 * We loop through the elements in the array and add them to our
+		 * tracking hashtable.
+		 */
+		null_present = false;
+		for (j = 0; j < num_elems; j++)
+		{
+			Datum		elem_value;
+			bool		found;
+
+			/* No null element processing other than flag setting here */
+			if (elem_nulls[j])
+			{
+				null_present = true;
+				continue;
+			}
+
+			/* Lookup current element in hashtable, adding it if new */
+			elem_value = elem_values[j];
+			item = (TrackItem *) hash_search(elements_tab,
+											 (const void *) &elem_value,
+											 HASH_ENTER, &found);
+
+			if (found)
+			{
+				/* The element value is already on the tracking list */
+
+				/*
+				 * The operators we assist ignore duplicate array elements,
+				 * so count a given distinct element only once per array.
+				 */
+				if (item->last_container == array_no)
+					continue;
+
+				item->frequency++;
+				item->last_container = array_no;
+			}
+			else
+			{
+				/* Initialize new tracking list element */
+
+				/*
+				 * If element type is pass-by-reference, we must copy it
+				 * into palloc'd space, so that we can release the array
+				 * below.  (We do this so that the space needed for element
+				 * values is limited by the size of the hashtable; if we
+				 * kept all the array values around, it could be much more.)
+				 */
+				item->key = datumCopy(elem_value,
+									  extra_data->typbyval,
+									  extra_data->typlen);
+
+				item->frequency = 1;
+				item->delta = b_current - 1;
+				item->last_container = array_no;
+			}
+
+			/* element_no is the number of elements processed (ie N) */
+			element_no++;
+
+			/* We prune the D structure after processing each bucket */
+			if (element_no % bucket_width == 0)
+			{
+				prune_element_hashtable(elements_tab, b_current);
+				b_current++;
+			}
+		}
+
+		/* Count null element presence once per array. */
+		if (null_present)
+			null_elem_cnt++;
+
+		/* Update frequency of the particular array distinct element count. */
+		distinct_count = (int) (element_no - prev_element_no);
+		count_item = (DECountItem *) hash_search(count_tab, &distinct_count,
+												 HASH_ENTER,
+												 &count_item_found);
+
+		if (count_item_found)
+			count_item->frequency++;
+		else
+			count_item->frequency = 1;
+
+		/* Free memory allocated while detoasting. */
+		if (PointerGetDatum(array) != value)
+			pfree(array);
+		pfree(elem_values);
+		pfree(elem_nulls);
+	}
+
+	/* Skip pg_statistic slots occupied by standard statistics */
+	slot_idx = 0;
+	while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0)
+		slot_idx++;
+	if (slot_idx > STATISTIC_NUM_SLOTS - 2)
+		elog(ERROR, "insufficient pg_statistic slots for array stats");
+
+	/* We can only compute real stats if we found some non-null values. */
+	if (analyzed_rows > 0)
+	{
+		int			nonnull_cnt = analyzed_rows;
+		int			count_items_count;
+		int			i;
+		TrackItem **sort_table;
+		int			track_len;
+		int64		cutoff_freq;
+		int64		minfreq,
+					maxfreq;
+
+		/*
+		 * We assume the standard stats code already took care of setting
+		 * stats_valid, stanullfrac, stawidth, stadistinct.  We'd have to
+		 * re-compute those values if we wanted to not store the standard
+		 * stats.
+		 */
+
+		/*
+		 * Construct an array of the interesting hashtable items, that is,
+		 * those meeting the cutoff frequency (s - epsilon)*N.	Also identify
+		 * the minimum and maximum frequencies among these items.
+		 *
+		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
+		 * frequency is 9*N / bucket_width.
+		 */
+		cutoff_freq = 9 * element_no / bucket_width;
+
+		i = hash_get_num_entries(elements_tab); /* surely enough space */
+		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);
+
+		hash_seq_init(&scan_status, elements_tab);
+		track_len = 0;
+		minfreq = element_no;
+		maxfreq = 0;
+		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
+		{
+			if (item->frequency > cutoff_freq)
+			{
+				sort_table[track_len++] = item;
+				minfreq = Min(minfreq, item->frequency);
+				maxfreq = Max(maxfreq, item->frequency);
+			}
+		}
+		Assert(track_len <= i);
+
+		/* emit some statistics for debug purposes */
+		elog(DEBUG3, "compute_array_stats: target # mces = %d, "
+			 "bucket width = %d, "
+			 "# elements = " INT64_FORMAT ", hashtable size = %d, "
+			 "usable entries = %d",
+			 num_mcelem, bucket_width, element_no, i, track_len);
+
+		/*
+		 * If we obtained more elements than we really want, get rid of those
+		 * with least frequencies.	The easiest way is to qsort the array into
+		 * descending frequency order and truncate the array.
+		 */
+		if (num_mcelem < track_len)
+		{
+			qsort(sort_table, track_len, sizeof(TrackItem *),
+				  trackitem_compare_frequencies_desc);
+			/* reset minfreq to the smallest frequency we're keeping */
+			minfreq = sort_table[num_mcelem - 1]->frequency;
+		}
+		else
+			num_mcelem = track_len;
+
+		/* Generate MCELEM slot entry */
+		if (num_mcelem > 0)
+		{
+			MemoryContext old_context;
+			Datum	   *mcelem_values;
+			float4	   *mcelem_freqs;
+
+			/*
+			 * We want to store statistics sorted on the element value using
+			 * the element type's default comparison function.  This permits
+			 * fast binary searches in selectivity estimation functions.
+			 */
+			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
+				  trackitem_compare_element);
+
+			/* Must copy the target values into anl_context */
+			old_context = MemoryContextSwitchTo(stats->anl_context);
+
+			/*
+			 * We sorted statistics on the element value, but we want to be
+			 * able to find the minimal and maximal frequencies without going
+			 * through all the values.	We also want the frequency of null
+			 * elements.  Store these three values at the end of mcelem_freqs.
+			 */
+			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
+			mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4));
+
+			/*
+			 * See comments above about use of nonnull_cnt as the divisor for
+			 * the final frequency estimates.
+			 */
+			for (i = 0; i < num_mcelem; i++)
+			{
+				TrackItem  *item = sort_table[i];
+
+				mcelem_values[i] = datumCopy(item->key,
+											 extra_data->typbyval,
+											 extra_data->typlen);
+				mcelem_freqs[i] = (double) item->frequency /
+					(double) nonnull_cnt;
+			}
+			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
+			mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt;
+			mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt;
+
+			MemoryContextSwitchTo(old_context);
+
+			stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM;
+			stats->staop[slot_idx] = extra_data->eq_opr;
+			stats->stanumbers[slot_idx] = mcelem_freqs;
+			/* See above comment about extra stanumber entries */
+			stats->numnumbers[slot_idx] = num_mcelem + 3;
+			stats->stavalues[slot_idx] = mcelem_values;
+			stats->numvalues[slot_idx] = num_mcelem;
+			/* We are storing values of element type */
+			stats->statypid[slot_idx] = extra_data->type_id;
+			stats->statyplen[slot_idx] = extra_data->typlen;
+			stats->statypbyval[slot_idx] = extra_data->typbyval;
+			stats->statypalign[slot_idx] = extra_data->typalign;
+			slot_idx++;
+		}
+
+		/* Generate DECHIST slot entry */
+		count_items_count = hash_get_num_entries(count_tab);
+		if (count_items_count > 0)
+		{
+			int			num_hist = stats->attr->attstattarget;
+			DECountItem **sorted_count_items;
+			int			count_item_index;
+			int			delta;
+			int			frac;
+			float4	   *hist;
+
+			/* num_hist must be at least 2 for the loop below to work */
+			num_hist = Max(num_hist, 2);
+
+			/*
+			 * Create an array of DECountItem pointers, and sort them into
+			 * increasing count order.
+			 */
+			sorted_count_items = (DECountItem **)
+				palloc(sizeof(DECountItem *) * count_items_count);
+			hash_seq_init(&scan_status, count_tab);
+			count_item_index = 0;
+			while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
+			{
+				sorted_count_items[count_item_index++] = count_item;
+			}
+			qsort(sorted_count_items, count_items_count,
+				  sizeof(DECountItem *), countitem_compare_count);
+
+			/*
+			 * Fill stanumbers with the histogram, followed by the average
+			 * count.  This array must be stored in anl_context.
+			 */
+			hist = (float4 *)
+				MemoryContextAlloc(stats->anl_context,
+								   sizeof(float4) * (num_hist + 1));
+			hist[num_hist] = (double) element_no / (double) nonnull_cnt;
+
+			/*
+			 * Construct the histogram.
+			 *
+			 * XXX this needs work: frac could overflow, and it's not clear
+			 * how or why the code works.  Even if it does work, it needs
+			 * documented.
+			 */
+			delta = analyzed_rows - 1;
+			count_item_index = 0;
+			frac = sorted_count_items[0]->frequency * (num_hist - 1);
+			for (i = 0; i < num_hist; i++)
+			{
+				while (frac <= 0)
+				{
+					count_item_index++;
+					Assert(count_item_index < count_items_count);
+					frac += sorted_count_items[count_item_index]->frequency * (num_hist - 1);
+				}
+				hist[i] = sorted_count_items[count_item_index]->count;
+				frac -= delta;
+			}
+			Assert(count_item_index == count_items_count - 1);
+
+			stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
+			stats->staop[slot_idx] = extra_data->eq_opr;
+			stats->stanumbers[slot_idx] = hist;
+			stats->numnumbers[slot_idx] = num_hist + 1;
+			slot_idx++;
+		}
+	}
+
+	/*
+	 * We don't need to bother cleaning up any of our temporary palloc's. The
+	 * hashtable should also go away, as it used a child memory context.
+	 */
+}
+
+/*
+ * A function to prune the D structure from the Lossy Counting algorithm.
+ * Consult compute_tsvector_stats() for wider explanation.
+ */
+static void
+prune_element_hashtable(HTAB *elements_tab, int b_current)
+{
+	HASH_SEQ_STATUS scan_status;
+	TrackItem  *item;
+
+	hash_seq_init(&scan_status, elements_tab);
+	while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
+	{
+		if (item->frequency + item->delta <= b_current)
+		{
+			Datum		value = item->key;
+
+			if (hash_search(elements_tab, (const void *) &item->key,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "hash table corrupted");
+			/* We should free memory if element is not passed by value */
+			if (!array_extra_data->typbyval)
+				pfree(DatumGetPointer(value));
+		}
+	}
+}
+
+/*
+ * Hash function for elements.
+ *
+ * We use the element type's default hash opclass, and the default collation
+ * if the type is collation-sensitive.
+ */
+static uint32
+element_hash(const void *key, Size keysize)
+{
+	Datum		d = *((const Datum *) key);
+	Datum		h;
+
+	h = FunctionCall1Coll(array_extra_data->hash, DEFAULT_COLLATION_OID, d);
+	return DatumGetUInt32(h);
+}
+
+/*
+ * Matching function for elements, to be used in hashtable lookups.
+ */
+static int
+element_match(const void *key1, const void *key2, Size keysize)
+{
+	/* The keysize parameter is superfluous here */
+	return element_compare(key1, key2);
+}
+
+/*
+ * Comparison function for elements.
+ *
+ * We use the element type's default btree opclass, and the default collation
+ * if the type is collation-sensitive.
+ *
+ * XXX consider using SortSupport infrastructure
+ */
+static int
+element_compare(const void *key1, const void *key2)
+{
+	Datum		d1 = *((const Datum *) key1);
+	Datum		d2 = *((const Datum *) key2);
+	Datum		c;
+
+	c = FunctionCall2Coll(array_extra_data->cmp, DEFAULT_COLLATION_OID, d1, d2);
+	return DatumGetInt32(c);
+}
+
+/*
+ * qsort() comparator for sorting TrackItems by frequencies (descending sort)
+ */
+static int
+trackitem_compare_frequencies_desc(const void *e1, const void *e2)
+{
+	const TrackItem *const * t1 = (const TrackItem *const *) e1;
+	const TrackItem *const * t2 = (const TrackItem *const *) e2;
+
+	return (*t2)->frequency - (*t1)->frequency;
+}
+
+/*
+ * qsort() comparator for sorting TrackItems by element values
+ */
+static int
+trackitem_compare_element(const void *e1, const void *e2)
+{
+	const TrackItem *const * t1 = (const TrackItem *const *) e1;
+	const TrackItem *const * t2 = (const TrackItem *const *) e2;
+
+	return element_compare(&(*t1)->key, &(*t2)->key);
+}
+
+/*
+ * qsort() comparator for sorting DECountItems by count
+ */
+static int
+countitem_compare_count(const void *e1, const void *e2)
+{
+	const DECountItem * const *t1 = (const DECountItem * const *) e1;
+	const DECountItem * const *t2 = (const DECountItem * const *) e2;
+
+	if ((*t1)->count < (*t2)->count)
+		return -1;
+	else if ((*t1)->count == (*t2)->count)
+		return 0;
+	else
+		return 1;
+}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 0a685aac2c..382cd7372b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -127,6 +127,7 @@
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
 #include "utils/tqual.h"
+#include "utils/typcache.h"
 
 
 /* Hooks for plugins to get control when we ask for stats */
@@ -1701,27 +1702,18 @@ scalararraysel(PlannerInfo *root,
 {
 	Oid			operator = clause->opno;
 	bool		useOr = clause->useOr;
+	bool		isEquality = false;
+	bool		isInequality = false;
 	Node	   *leftop;
 	Node	   *rightop;
 	Oid			nominal_element_type;
 	Oid			nominal_element_collation;
+	TypeCacheEntry *typentry;
 	RegProcedure oprsel;
 	FmgrInfo	oprselproc;
 	Selectivity s1;
 
-	/*
-	 * First, look up the underlying operator's selectivity estimator. Punt if
-	 * it hasn't got one.
-	 */
-	if (is_join_clause)
-		oprsel = get_oprjoin(operator);
-	else
-		oprsel = get_oprrest(operator);
-	if (!oprsel)
-		return (Selectivity) 0.5;
-	fmgr_info(oprsel, &oprselproc);
-
-	/* deconstruct the expression */
+	/* First, deconstruct the expression */
 	Assert(list_length(clause->args) == 2);
 	leftop = (Node *) linitial(clause->args);
 	rightop = (Node *) lsecond(clause->args);
@@ -1737,6 +1729,46 @@ scalararraysel(PlannerInfo *root,
 	rightop = strip_array_coercion(rightop);
 
 	/*
+	 * Detect whether the operator is the default equality or inequality
+	 * operator of the array element type.
+	 */
+	typentry = lookup_type_cache(nominal_element_type, TYPECACHE_EQ_OPR);
+	if (OidIsValid(typentry->eq_opr))
+	{
+		if (operator == typentry->eq_opr)
+			isEquality = true;
+		else if (get_negator(operator) == typentry->eq_opr)
+			isInequality = true;
+	}
+
+	/*
+	 * If it is equality or inequality, we might be able to estimate this as
+	 * a form of array containment; for instance "const = ANY(column)" can be
+	 * treated as "ARRAY[const] <@ column".  scalararraysel_containment tries
+	 * that, and returns the selectivity estimate if successful, or -1 if not.
+	 */
+	if ((isEquality || isInequality) && !is_join_clause)
+	{
+		s1 = scalararraysel_containment(root, leftop, rightop,
+										nominal_element_type,
+										isEquality, useOr, varRelid);
+		if (s1 >= 0.0)
+			return s1;
+	}
+
+	/*
+	 * Look up the underlying operator's selectivity estimator. Punt if it
+	 * hasn't got one.
+	 */
+	if (is_join_clause)
+		oprsel = get_oprjoin(operator);
+	else
+		oprsel = get_oprrest(operator);
+	if (!oprsel)
+		return (Selectivity) 0.5;
+	fmgr_info(oprsel, &oprselproc);
+
+	/*
 	 * We consider three cases:
 	 *
 	 * 1. rightop is an Array constant: deconstruct the array, apply the
author	Tom Lane <tgl@sss.pgh.pa.us>	2012-03-03 20:20:19 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2012-03-03 20:20:57 -0500
commit	0e5e167aaea4ceb355a6e20eec96c4f7d05527ab (patch)
tree	1b1b338461cba27a2d783db13b74d1b7b86b6681 /src/backend
parent	34c978442c55dd13a3a8c6b90fd4380dad02f3da (diff)
download	postgresql-0e5e167aaea4ceb355a6e20eec96c4f7d05527ab.tar.gz