Add support for multivariate MCV lists

Introduce a third extended statistic type, supported by the CREATE STATISTICS command - MCV lists, a generalization of the statistic already built and used for individual columns. Compared to the already supported types (n-distinct coefficients and functional dependencies), MCV lists are more complex, include column values and allow estimation of much wider range of common clauses (equality and inequality conditions, IS NULL, IS NOT NULL etc.). Similarly to the other types, a new pseudo-type (pg_mcv_list) is used. Author: Tomas Vondra Reviewed-by: Dean Rasheed, David Rowley, Mark Dilger, Alvaro Herrera Discussion: https://postgr.es/m/dfdac334-9cf2-2597-fb27-f0fb3753f435@2ndquadrant.com

Add support for multivariate MCV lists
Introduce a third extended statistic type, supported by the CREATE STATISTICS command - MCV lists, a generalization of the statistic already built and used for individual columns. Compared to the already supported types (n-distinct coefficients and functional dependencies), MCV lists are more complex, include column values and allow estimation of much wider range of common clauses (equality and inequality conditions, IS NULL, IS NOT NULL etc.). Similarly to the other types, a new pseudo-type (pg_mcv_list) is used. Author: Tomas Vondra Reviewed-by: Dean Rasheed, David Rowley, Mark Dilger, Alvaro Herrera Discussion: https://postgr.es/m/dfdac334-9cf2-2597-fb27-f0fb3753f435@2ndquadrant.com
7300a699 · Tomas Vondra · 333ed246 · 7300a699 · 7300a699 · 7300a699
Commit 7300a699 authored Mar 27, 2019 by Tomas Vondra
32 changed files
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -6562,7 +6562,8 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        An array containing codes for the enabled statistic kinds;
        valid values are:
        <literal>d</literal> for n-distinct statistics,
-        <literal>f</literal> for functional dependency statistics
+        <literal>f</literal> for functional dependency statistics, and
+        <literal>m</literal> for most common values (MCV) list statistics
      </entry>
     </row>
@@ -6585,6 +6586,16 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
      </entry>
     </row>
+     <row>
+      <entry><structfield>stxmcv</structfield></entry>
+      <entry><type>pg_mcv_list</type></entry>
+      <entry></entry>
+      <entry>
+       MCV (most-common values) list statistics, serialized as
+       <structname>pg_mcv_list</structname> type.
+      </entry>
+     </row>
    </tbody>
   </tgroup>
  </table>

--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -22174,4 +22174,86 @@ CREATE EVENT TRIGGER test_table_rewrite_oid
  </sect2>
  </sect1>
+  <sect1 id="functions-statistics">
+   <title>Statistics Information Functions</title>
+   <indexterm zone="functions-statistics">
+    <primary>function</primary>
+    <secondary>statistics</secondary>
+   </indexterm>
+   <para>
+    <productname>PostgreSQL</productname> provides a function to inspect complex
+    statistics defined using the <command>CREATE STATISTICS</command> command.
+   </para>
+  <sect2 id="functions-statistics-mcv">
+   <title>Inspecting MCV lists</title>
+   <indexterm>
+     <primary>pg_mcv_list_items</primary>
+     <secondary>pg_mcv_list</secondary>
+   </indexterm>
+   <para>
+    <function>pg_mcv_list_items</function> returns a list of all items
+    stored in a multi-column <acronym>MCV</acronym> list, and returns the
+    following columns:
+    <informaltable>
+     <tgroup cols="3">
+      <thead>
+       <row>
+        <entry>Name</entry>
+        <entry>Type</entry>
+        <entry>Description</entry>
+       </row>
+      </thead>
+      <tbody>
+       <row>
+        <entry><literal>index</literal></entry>
+        <entry><type>int</type></entry>
+        <entry>index of the item in the <acronym>MCV</acronym> list</entry>
+       </row>
+       <row>
+        <entry><literal>values</literal></entry>
+        <entry><type>text[]</type></entry>
+        <entry>values stored in the MCV item</entry>
+       </row>
+       <row>
+        <entry><literal>nulls</literal></entry>
+        <entry><type>boolean[]</type></entry>
+        <entry>flags identifying <literal>NULL</literal> values</entry>
+       </row>
+       <row>
+        <entry><literal>frequency</literal></entry>
+        <entry><type>double precision</type></entry>
+        <entry>frequency of this <acronym>MCV</acronym> item</entry>
+       </row>
+       <row>
+        <entry><literal>base_frequency</literal></entry>
+        <entry><type>double precision</type></entry>
+        <entry>base frequency of this <acronym>MCV</acronym> item</entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </informaltable>
+   </para>
+   <para>
+    The <function>pg_mcv_list_items</function> function can be used like this:
+<programlisting>
+SELECT m.* FROM pg_statistic_ext,
+                pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts';
+</programlisting>
+     Values of the <type>pg_mcv_list</type> can be obtained only from the
+     <literal>pg_statistic_ext.stxmcv</literal> column.
+   </para>
+  </sect2>
+  </sect1>
 </chapter>
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1285,6 +1285,72 @@ nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
     plans.  Otherwise, the <command>ANALYZE</command> cycles are just wasted.
    </para>
   </sect3>
+   <sect3>
+    <title>Multivariate MCV lists</title>
+    <para>
+     Another type of statistics stored for each column are most-common value
+     lists.  This allows very accurate estimates for individual columns, but
+     may result in significant misestimates for queries with conditions on
+     multiple columns.
+    </para>
+    <para>
+     To improve such estimates, <command>ANALYZE</command> can collect MCV
+     lists on combinations of columns.  Similarly to functional dependencies
+     and n-distinct coefficients, it's impractical to do this for every
+     possible column grouping.  Even more so in this case, as the MCV list
+     (unlike functional dependencies and n-distinct coefficients) does store
+     the common column values.  So data is collected only for those groups
+     of columns appearing together in a statistics object defined with the
+     <literal>mcv</literal> option.
+    </para>
+    <para>
+     Continuing the previous example, the MCV list for a table of ZIP codes
+     might look like the following (unlike for simpler types of statistics,
+     a function is required for inspection of MCV contents):
+<programlisting>
+CREATE STATISTICS stts3 (mcv) ON state, city FROM zipcodes;
+ANALYZE zipcodes;
+SELECT m.* FROM pg_statistic_ext,
+                pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts3';
+ index |         values         | nulls | frequency | base_frequency 
+-------+------------------------+-------+-----------+----------------
+     0 | {Washington, DC}       | {f,f} |  0.003467 |        2.7e-05
+     1 | {Apo, AE}              | {f,f} |  0.003067 |        1.9e-05
+     2 | {Houston, TX}          | {f,f} |  0.002167 |       0.000133
+     3 | {El Paso, TX}          | {f,f} |     0.002 |       0.000113
+     4 | {New York, NY}         | {f,f} |  0.001967 |       0.000114
+     5 | {Atlanta, GA}          | {f,f} |  0.001633 |        3.3e-05
+     6 | {Sacramento, CA}       | {f,f} |  0.001433 |        7.8e-05
+     7 | {Miami, FL}            | {f,f} |    0.0014 |          6e-05
+     8 | {Dallas, TX}           | {f,f} |  0.001367 |        8.8e-05
+     9 | {Chicago, IL}          | {f,f} |  0.001333 |        5.1e-05
+   ...
+(99 rows)
+</programlisting>
+     This indicates that the most common combination of city and state is
+     Washington in DC, with actual frequency (in the sample) about 0.35%.
+     The base frequency of the combination (as computed from the simple
+     per-column frequencies) is only 0.0027%, resulting in two orders of
+     magnitude under-estimates.
+    </para>
+    <para>
+     It's advisable to create <acronym>MCV</acronym> statistics objects only
+     on combinations of columns that are actually used in conditions together,
+     and for which misestimation of the number of groups is resulting in bad
+     plans.  Otherwise, the <command>ANALYZE</command> and planning cycles
+     are just wasted.
+    </para>
+   </sect3>
  </sect2>
 </sect1>

--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@@ -455,7 +455,7 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
   <secondary>multivariate</secondary>
  </indexterm>
-  <sect2>
+  <sect2 id="functional-dependencies">
   <title>Functional Dependencies</title>
   <para>
@@ -540,7 +540,7 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
   </para>
  </sect2>
-  <sect2>
+  <sect2 id="multivariate-ndistinct-counts">
   <title>Multivariate N-Distinct Counts</title>
   <para>
@@ -585,6 +585,118 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
   </para>
  </sect2>
+  <sect2 id="mcv-lists">
+   <title>MCV lists</title>
+   <para>
+    As explained in <xref linkend="functional-dependencies"/>, functional
+    dependencies are very cheap and efficient type of statistics, but their
+    main limitation is their global nature (only tracking dependencies at
+    the column level, not between individual column values).
+   </para>
+   <para>
+    This section introduces multivariate variant of <acronym>MCV</acronym>
+    (most-common values) lists, a straightforward extension of the per-column
+    statistics described in <xref linkend="row-estimation-examples"/>.  These
+    statistics address the limitation by storing individual values, but it is
+    naturally more expensive, both in terms of building the statistics in
+    <command>ANALYZE</command>, storage and planning time.
+   </para>
+   <para>
+    Let's look at the query from <xref linkend="functional-dependencies"/>
+    again, but this time with a <acronym>MCV</acronym> list created on the
+    same set of columns (be sure to drop the functional dependencies, to
+    make sure the planner uses the newly created statistics).
+<programlisting>
+DROP STATISTICS stts;
+CREATE STATISTICS stts2 (mcv) ON a, b FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
+                                   QUERY PLAN
+-------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=100 width=8) (actual rows=100 loops=1)
+   Filter: ((a = 1) AND (b = 1))
+   Rows Removed by Filter: 9900
+</programlisting>
+    The estimate is as accurate as with the functional dependencies, mostly
+    thanks to the table being fairly small and having a simple distribution
+    with a low number of distinct values. Before looking at the second query,
+    which was not handled by functional dependencies particularly well,
+    let's inspect the <acronym>MCV</acronym> list a bit.
+   </para>
+   <para>
+    Inspecting the <acronym>MCV</acronym> list is possible using
+    <function>pg_mcv_list_items</function> set-returning function.
+<programlisting>
+SELECT m.* FROM pg_statistic_ext,
+                pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts2';
+ index |  values  | nulls | frequency | base_frequency 
+-------+----------+-------+-----------+----------------
+     0 | {0, 0}   | {f,f} |      0.01 |         0.0001
+     1 | {1, 1}   | {f,f} |      0.01 |         0.0001
+   ...
+    49 | {49, 49} | {f,f} |      0.01 |         0.0001
+    50 | {50, 50} | {f,f} |      0.01 |         0.0001
+   ...
+    97 | {97, 97} | {f,f} |      0.01 |         0.0001
+    98 | {98, 98} | {f,f} |      0.01 |         0.0001
+    99 | {99, 99} | {f,f} |      0.01 |         0.0001
+(100 rows)
+</programlisting>
+    This confirms there are 100 distinct combinations in the two columns, and
+    all of them are about equally likely (1% frequency for each one).  The
+    base frequency is the frequency computed from per-column statistics, as if
+    there were no multi-column statistics. Had there been any null values in
+    either of the columns, this would be identified in the
+    <structfield>nulls</structfield> column.
+   </para>
+   <para>
+    When estimating the selectivity, the planner applies all the conditions
+    on items in the <acronym>MCV</acronym> list, and then sums the frequencies
+    of the matching ones. See <function>mcv_clauselist_selectivity</function>
+    in <filename>src/backend/statistics/mcv.c</filename> for details.
+   </para>
+   <para>
+    Compared to functional dependencies, <acronym>MCV</acronym> lists have two
+    major advantages. Firstly, the list stores actual values, making it possible
+    to decide which combinations are compatible.
+<programlisting>
+EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10;
+                                 QUERY PLAN
+---------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=1 width=8) (actual rows=0 loops=1)
+   Filter: ((a = 1) AND (b = 10))
+   Rows Removed by Filter: 10000
+</programlisting>
+    Secondly, <acronym>MCV</acronym> lists handle a wider range of clause types,
+    not just equality clauses like functional dependencies. See for example the
+    example range query, presented earlier:
+<programlisting>
+EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a &lt;= 49 AND b &gt; 49;
+                                QUERY PLAN
+---------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=1 width=8) (actual rows=0 loops=1)
+   Filter: ((a &lt;= 49) AND (b &gt; 49))
+   Rows Removed by Filter: 10000
+</programlisting>
+   </para>
+  </sect2>
 </sect1>
 <sect1 id="planner-stats-security">

--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -81,9 +81,10 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
     <para>
      A statistics kind to be computed in this statistics object.
      Currently supported kinds are
-      <literal>ndistinct</literal>, which enables n-distinct statistics, and
+      <literal>ndistinct</literal>, which enables n-distinct statistics,
      <literal>dependencies</literal>, which enables functional
-      dependency statistics.
+      dependency statistics, and <literal>mcv</literal> which enables
+      most-common values lists.
      If this clause is omitted, all supported statistics kinds are
      included in the statistics object.
      For more information, see <xref linkend="planner-stats-extended"/>
@@ -164,6 +165,36 @@ EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
   conditions are redundant and does not underestimate the row count.
  </para>
+  <para>
+   Create table <structname>t2</structname> with two perfectly correlated columns
+   (containing identical data), and a MCV list on those columns:
+<programlisting>
+CREATE TABLE t2 (
+    a   int,
+    b   int
+);
+INSERT INTO t2 SELECT mod(i,100), mod(i,100)
+                 FROM generate_series(1,1000000) s(i);
+CREATE STATISTICS s2 (mcv) ON (a, b) FROM t2;
+ANALYZE t2;
+-- valid combination (found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 1);
+-- invalid combination (not found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
+</programlisting>
+   The MCV list gives the planner more detailed information about the
+   specific values that commonly appear in the table, as well as an upper
+   bound on the selectivities of combinations of values that do not appear
+   in the table, allowing it to generate better estimates in both cases.
+  </para>
 </refsect1>
 <refsect1>

--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -73,11 +73,12 @@ CreateStatistics(CreateStatsStmt *stmt)
 	Oid			relid;
 	ObjectAddress parentobject,
 				myself;
-	Datum		types[2];		/* one for each possible type of statistic */
+	Datum		types[3];		/* one for each possible type of statistic */
 	int			ntypes;
 	ArrayType  *stxkind;
 	bool		build_ndistinct;
 	bool		build_dependencies;
+	bool		build_mcv;
 	bool		requested_type = false;
 	int			i;
 	ListCell   *cell;
@@ -272,6 +273,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	 */
 	build_ndistinct = false;
 	build_dependencies = false;
+	build_mcv = false;
 	foreach(cell, stmt->stat_types)
 	{
 		char	   *type = strVal((Value *) lfirst(cell));
@@ -286,6 +288,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 			build_dependencies = true;
 			requested_type = true;
 		}
+		else if (strcmp(type, "mcv") == 0)
+		{
+			build_mcv = true;
+			requested_type = true;
+		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -297,6 +304,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	{
 		build_ndistinct = true;
 		build_dependencies = true;
+		build_mcv = true;
 	}
 	/* construct the char array of enabled statistic types */
@@ -305,6 +313,8 @@ CreateStatistics(CreateStatsStmt *stmt)
 		types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT);
 	if (build_dependencies)
 		types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
+	if (build_mcv)
+		types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
 	Assert(ntypes > 0 && ntypes <= lengthof(types));
 	stxkind = construct_array(types, ntypes, CHAROID, 1, true, 'c');
@@ -329,6 +339,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	/* no statistics built yet */
 	nulls[Anum_pg_statistic_ext_stxndistinct - 1] = true;
 	nulls[Anum_pg_statistic_ext_stxdependencies - 1] = true;
+	nulls[Anum_pg_statistic_ext_stxmcv - 1] = true;
 	/* insert it into pg_statistic_ext */
 	htup = heap_form_tuple(statrel->rd_att, values, nulls);
@@ -424,23 +435,72 @@ RemoveStatisticsById(Oid statsOid)
 * null until the next ANALYZE.  (Note that the type change hasn't actually
 * happened yet, so one option that's *not* on the table is to recompute
 * immediately.)
+ *
+ * For both ndistinct and functional-dependencies stats, the on-disk
+ * representation is independent of the source column data types, and it is
+ * plausible to assume that the old statistic values will still be good for
+ * the new column contents.  (Obviously, if the ALTER COLUMN TYPE has a USING
+ * expression that substantially alters the semantic meaning of the column
+ * values, this assumption could fail.  But that seems like a corner case
+ * that doesn't justify zapping the stats in common cases.)
+ *
+ * For MCV lists that's not the case, as those statistics store the datums
+ * internally. In this case we simply reset the statistics value to NULL.
 */
 void
 UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
 							  Oid oldColumnType, Oid newColumnType)
 {
+	HeapTuple	stup,
+				oldtup;
+	Relation	rel;
+	Datum		values[Natts_pg_statistic_ext];
+	bool		nulls[Natts_pg_statistic_ext];
+	bool		replaces[Natts_pg_statistic_ext];
+	oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statsOid));
+	if (!oldtup)
+		elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
 	/*
-	 * Currently, we don't actually need to do anything here.  For both
+	 * When none of the defined statistics types contain datum values
-	 * ndistinct and functional-dependencies stats, the on-disk representation
+	 * from the table's columns then there's no need to reset the stats.
-	 * is independent of the source column data types, and it is plausible to
+	 * Functional dependencies and ndistinct stats should still hold true.
-	 * assume that the old statistic values will still be good for the new
-	 * column contents.  (Obviously, if the ALTER COLUMN TYPE has a USING
-	 * expression that substantially alters the semantic meaning of the column
-	 * values, this assumption could fail.  But that seems like a corner case
-	 * that doesn't justify zapping the stats in common cases.)
-	 *
-	 * Future types of extended stats will likely require us to work harder.
 	 */
+	if (!statext_is_kind_built(oldtup, STATS_EXT_MCV))
+	{
+		ReleaseSysCache(oldtup);
+		return;
+	}
+	/*
+	 * OK, we need to reset some statistics. So let's build the new tuple,
+	 * replacing the affected statistics types with NULL.
+	 */
+	memset(nulls, 0, Natts_pg_statistic_ext * sizeof(bool));
+	memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool));
+	memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum));
+	replaces[Anum_pg_statistic_ext_stxmcv - 1] = true;
+	nulls[Anum_pg_statistic_ext_stxmcv - 1] = true;
+	rel = heap_open(StatisticExtRelationId, RowExclusiveLock);
+	/* replace the old tuple */
+	stup = heap_modify_tuple(oldtup,
+							 RelationGetDescr(rel),
+							 values,
+							 nulls,
+							 replaces);
+	ReleaseSysCache(oldtup);
+	CatalogTupleUpdate(rel, &stup->t_self, stup);
+	heap_freetuple(stup);
+	heap_close(rel, RowExclusiveLock);
 }
 /*

--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -443,6 +443,50 @@ bms_is_member(int x, const Bitmapset *a)
 	return false;
 }
+/*
+ * bms_member_index
+ *		determine 0-based index of member x in the bitmap
+ *
+ * Returns (-1) when x is not a member.
+ */
+int
+bms_member_index(Bitmapset *a, int x)
+{
+	int			i;
+	int			bitnum;
+	int			wordnum;
+	int			result = 0;
+	bitmapword	mask;
+	/* return -1 if not a member of the bitmap */
+	if (!bms_is_member(x, a))
+		return -1;
+	wordnum = WORDNUM(x);
+	bitnum = BITNUM(x);
+	/* count bits in preceding words */
+	for (i = 0; i < wordnum; i++)
+	{
+		bitmapword	w = a->words[i];
+		/* No need to count the bits in a zero word */
+		if (w != 0)
+			result += bmw_popcount(w);
+	}
+	/*
+	 * Now add bits of the last word, but only those before the item.
+	 * We can do that by applying a mask and then using popcount again.
+	 * To get 0-based index, we want to count only preceding bits, not
+	 * the item itself, so we subtract 1.
+	 */
+	mask = ((bitmapword) 1 << bitnum) - 1;
+	result += bmw_popcount(a->words[wordnum] & mask);
+	return result;
+}
 /*
 * bms_overlap - do sets overlap (ie, have a nonempty intersection)?
 */

--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -60,17 +60,67 @@ static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root,
 *
 * See clause_selectivity() for the meaning of the additional parameters.
 *
+ * The basic approach is to apply extended statistics first, on as many
+ * clauses as possible, in order to capture cross-column dependencies etc.
+ * The remaining clauses are then estimated using regular statistics tracked
+ * for individual columns.  This is done by simply passing the clauses to
+ * clauselist_selectivity_simple.
+ */
+Selectivity
+clauselist_selectivity(PlannerInfo *root,
+					   List *clauses,
+					   int varRelid,
+					   JoinType jointype,
+					   SpecialJoinInfo *sjinfo)
+{
+	Selectivity s1 = 1.0;
+	RelOptInfo *rel;
+	Bitmapset  *estimatedclauses = NULL;
+	/*
+	 * Determine if these clauses reference a single relation.  If so, and if
+	 * it has extended statistics, try to apply those.
+	 */
+	rel = find_single_rel_for_clauses(root, clauses);
+	if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
+	{
+		/*
+		 * Estimate as many clauses as possible using extended statistics.
+		 *
+		 * 'estimatedclauses' tracks the 0-based list position index of
+		 * clauses that we've estimated using extended statistics, and that
+		 * should be ignored.
+		 */
+		s1 *= statext_clauselist_selectivity(root, clauses, varRelid,
+											 jointype, sjinfo, rel,
+											 &estimatedclauses);
+	}
+	/*
+	 * Apply normal selectivity estimates for the remaining clauses, passing
+	 * 'estimatedclauses' so that it skips already estimated ones.
+	 */
+	return s1 * clauselist_selectivity_simple(root, clauses, varRelid,
+											  jointype, sjinfo,
+											  estimatedclauses);
+}
+/*
+ * clauselist_selectivity_simple -
+ *	  Compute the selectivity of an implicitly-ANDed list of boolean
+ *	  expression clauses.  The list can be empty, in which case 1.0
+ *	  must be returned.  List elements may be either RestrictInfos
+ *	  or bare expression clauses --- the former is preferred since
+ *	  it allows caching of results.  The estimatedclauses bitmap tracks
+ *	  clauses that have already been estimated by other means.
+ *
+ * See clause_selectivity() for the meaning of the additional parameters.
+ *
 * Our basic approach is to take the product of the selectivities of the
 * subclauses.  However, that's only right if the subclauses have independent
 * probabilities, and in reality they are often NOT independent.  So,
 * we want to be smarter where we can.
 *
- * If the clauses taken together refer to just one relation, we'll try to
- * apply selectivity estimates using any extended statistics for that rel.
- * Currently we only have (soft) functional dependencies, so apply these in as
- * many cases as possible, and fall back on normal estimates for remaining
- * clauses.
- *
 * We also recognize "range queries", such as "x > 34 AND x < 42".  Clauses
 * are recognized as possible range query components if they are restriction
 * opclauses whose operators have scalarltsel or a related function as their
@@ -98,54 +148,29 @@ static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root,
 * selectivity functions; perhaps some day we can generalize the approach.
 */
 Selectivity
-clauselist_selectivity(PlannerInfo *root,
+clauselist_selectivity_simple(PlannerInfo *root,
 							  List *clauses,
 							  int varRelid,
 							  JoinType jointype,
-					   SpecialJoinInfo *sjinfo)
+							  SpecialJoinInfo *sjinfo,
+							  Bitmapset *estimatedclauses)
 {
 	Selectivity s1 = 1.0;
-	RelOptInfo *rel;
-	Bitmapset  *estimatedclauses = NULL;
 	RangeQueryClause *rqlist = NULL;
 	ListCell   *l;
 	int			listidx;
 	/*
-	 * If there's exactly one clause, just go directly to
+	 * If there's exactly one clause (and it was not estimated yet), just
-	 * clause_selectivity(). None of what we might do below is relevant.
+	 * go directly to clause_selectivity(). None of what we might do below
+	 * is relevant.
 	 */
-	if (list_length(clauses) == 1)
+	if ((list_length(clauses) == 1) &&
+		bms_num_members(estimatedclauses) == 0)
 		return clause_selectivity(root, (Node *) linitial(clauses),
 								  varRelid, jointype, sjinfo);
 	/*
-	 * Determine if these clauses reference a single relation.  If so, and if
-	 * it has extended statistics, try to apply those.
-	 */
-	rel = find_single_rel_for_clauses(root, clauses);
-	if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
-	{
-		/*
-		 * Perform selectivity estimations on any clauses found applicable by
-		 * dependencies_clauselist_selectivity.  'estimatedclauses' will be
-		 * filled with the 0-based list positions of clauses used that way, so
-		 * that we can ignore them below.
-		 */
-		s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid,
-												  jointype, sjinfo, rel,
-												  &estimatedclauses);
-		/*
-		 * This would be the place to apply any other types of extended
-		 * statistics selectivity estimations for remaining clauses.
-		 */
-	}
-	/*
-	 * Apply normal selectivity estimates for remaining clauses. We'll be
-	 * careful to skip any clauses which were already estimated above.
-	 *
 	 * Anything that doesn't look like a potential rangequery clause gets
 	 * multiplied into s1 and forgotten. Anything that does gets inserted into
 	 * an rqlist entry.

--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -1363,6 +1363,18 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
 			stainfos = lcons(info, stainfos);
 		}
+		if (statext_is_kind_built(htup, STATS_EXT_MCV))
+		{
+			StatisticExtInfo *info = makeNode(StatisticExtInfo);
+			info->statOid = statOid;
+			info->rel = rel;
+			info->kind = STATS_EXT_MCV;
+			info->keys = bms_copy(keys);
+			stainfos = lcons(info, stainfos);
+		}
 		ReleaseSysCache(htup);
 		bms_free(keys);
 	}

--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -1652,6 +1652,8 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
 			stat_types = lappend(stat_types, makeString("ndistinct"));
 		else if (enabled[i] == STATS_EXT_DEPENDENCIES)
 			stat_types = lappend(stat_types, makeString("dependencies"));
+		else if (enabled[i] == STATS_EXT_MCV)
+			stat_types = lappend(stat_types, makeString("mcv"));
 		else
 			elog(ERROR, "unrecognized statistics kind %c", enabled[i]);
 	}

--- a/src/backend/statistics/Makefile
+++ b/src/backend/statistics/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/statistics
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
-OBJS = extended_stats.o dependencies.o mvdistinct.o
+OBJS = extended_stats.o dependencies.o mcv.o mvdistinct.o
 include $(top_srcdir)/src/backend/common.mk
--- a/src/backend/statistics/README
+++ b/src/backend/statistics/README
@@ -18,6 +18,8 @@ There are currently two kinds of extended statistics:
    (b) soft functional dependencies (README.dependencies)
+    (c) MCV lists (README.mcv)
 Compatible clause types
 -----------------------
@@ -26,6 +28,8 @@ Each type of statistics may be used to estimate some subset of clause types.
    (a) functional dependencies - equality clauses (AND), possibly IS NULL
+    (b) MCV lists - equality and inequality clauses (AND, OR, NOT), IS [NOT] NULL
 Currently, only OpExprs in the form Var op Const, or Const op Var are
 supported, however it's feasible to expand the code later to also estimate the
 selectivities on clauses such as Var op Var.

--- a/src/backend/statistics/README.mcv
+++ b/src/backend/statistics/README.mcv
+MCV lists
+=========
+Multivariate MCV (most-common values) lists are a straightforward extension of
+regular MCV list, tracking most frequent combinations of values for a group of
+attributes.
+This works particularly well for columns with a small number of distinct values,
+as the list may include all the combinations and approximate the distribution
+very accurately.
+For columns with a large number of distinct values (e.g. those with continuous
+domains), the list will only track the most frequent combinations. If the
+distribution is mostly uniform (all combinations about equally frequent), the
+MCV list will be empty.
+Estimates of some clauses (e.g. equality) based on MCV lists are more accurate
+than when using histograms.
+Also, MCV lists don't necessarily require sorting of the values (the fact that
+we use sorting when building them is implementation detail), but even more
+importantly the ordering is not built into the approximation (while histograms
+are built on ordering). So MCV lists work well even for attributes where the
+ordering of the data type is disconnected from the meaning of the data. For
+example we know how to sort strings, but it's unlikely to make much sense for
+city names (or other label-like attributes).
+Selectivity estimation
+----------------------
+The estimation, implemented in mcv_clauselist_selectivity(), is quite simple
+in principle - we need to identify MCV items matching all the clauses and sum
+frequencies of all those items.
+Currently MCV lists support estimation of the following clause types:
+    (a) equality clauses      WHERE (a = 1) AND (b = 2)
+    (b) inequality clauses    WHERE (a < 1) AND (b >= 2)
+    (c) NULL clauses          WHERE (a IS NULL) AND (b IS NOT NULL)
+    (d) OR clauses            WHERE (a < 1) OR (b >= 2)
+It's possible to add support for additional clauses, for example:
+    (e) multi-var clauses     WHERE (a > b)
+and possibly others. These are tasks for the future, not yet implemented.
+Hashed MCV (not yet implemented)
+--------------------------------
+Regular MCV lists have to include actual values for each item, so if those items
+are large the list may be quite large. This is especially true for multivariate
+MCV lists, although the current implementation partially mitigates this by
+performing de-duplicating the values before storing them on disk.
+It's possible to only store hashes (32-bit values) instead of the actual values,
+significantly reducing the space requirements. Obviously, this would only make
+the MCV lists useful for estimating equality conditions (assuming the 32-bit
+hashes make the collisions rare enough).
+This might also complicate matching the columns to available stats.
+TODO Consider implementing hashed MCV list, storing just 32-bit hashes instead
+     of the actual values. This type of MCV list will be useful only for
+     estimating equality clauses, and will reduce space requirements for large
+     varlena types (in such cases we usually only want equality anyway).
+Inspecting the MCV list
+-----------------------
+Inspecting the regular (per-attribute) MCV lists is trivial, as it's enough
+to select the columns from pg_stats. The data is encoded as anyarrays, and
+all the items have the same data type, so anyarray provides a simple way to
+get a text representation.
+With multivariate MCV lists the columns may use different data types, making
+it impossible to use anyarrays. It might be possible to produce a similar
+array-like representation, but that would complicate further processing and
+analysis of the MCV list.
+So instead the MCV lists are stored in a custom data type (pg_mcv_list),
+which however makes it more difficult to inspect the contents. To make that
+easier, there's a SRF returning detailed information about the MCV lists.
+    SELECT m.* FROM pg_statistic_ext,
+                    pg_mcv_list_items(stxmcv) m WHERE stxname = 'stts2';
+It accepts one parameter - a pg_mcv_list value (which can only be obtained
+from pg_statistic_ext catalog, to defend against malicious input), and
+returns these columns:
+    - item index (0, ..., (nitems-1))
+    - values (string array)
+    - nulls only (boolean array)
+    - frequency (double precision)
+    - base_frequency (double precision)
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -202,13 +202,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 				  VacAttrStats **stats, Bitmapset *attrs)
 {
 	int			i,
-				j;
+				nitems;
-	int			nvalues = numrows * k;
 	MultiSortSupport mss;
 	SortItem   *items;
-	Datum	   *values;
+	AttrNumber *attnums;
-	bool	   *isnull;
+	AttrNumber *attnums_dep;
-	int		   *attnums;
+	int			numattrs;
 	/* counters valid within a group */
 	int			group_size = 0;
@@ -223,26 +222,16 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 	/* sort info for all attributes columns */
 	mss = multi_sort_init(k);
-	/* data for the sort */
-	items = (SortItem *) palloc(numrows * sizeof(SortItem));
-	values = (Datum *) palloc(sizeof(Datum) * nvalues);
-	isnull = (bool *) palloc(sizeof(bool) * nvalues);
-	/* fix the pointers to values/isnull */
-	for (i = 0; i < numrows; i++)
-	{
-		items[i].values = &values[i * k];
-		items[i].isnull = &isnull[i * k];
-	}
 	/*
-	 * Transform the bms into an array, to make accessing i-th member easier.
+	 * Transform the attrs from bitmap to an array to make accessing the i-th
+	 * member easier, and then construct a filtered version with only attnums
+	 * referenced by the dependency we validate.
 	 */
-	attnums = (int *) palloc(sizeof(int) * bms_num_members(attrs));
+	attnums = build_attnums_array(attrs, &numattrs);
-	i = 0;
-	j = -1;
+	attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
-	while ((j = bms_next_member(attrs, j)) >= 0)
+	for (i = 0; i < k; i++)
-		attnums[i++] = j;
+		attnums_dep[i] = attnums[dependency[i]];
 	/*
 	 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
@@ -257,7 +246,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 	 * perhaps at some point it'd be worth using column-specific collations?
 	 */
-	/* prepare the sort function for the first dimension, and SortItem array */
+	/* prepare the sort function for the dimensions */
 	for (i = 0; i < k; i++)
 	{
 		VacAttrStats *colstat = stats[dependency[i]];
@@ -270,19 +259,17 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 		/* prepare the sort function for this dimension */
 		multi_sort_add_dimension(mss, i, type->lt_opr, type->typcollation);
-		/* accumulate all the data for both columns into an array and sort it */
-		for (j = 0; j < numrows; j++)
-		{
-			items[j].values[i] =
-				heap_getattr(rows[j], attnums[dependency[i]],
-							 stats[i]->tupDesc, &items[j].isnull[i]);
-		}
 	}
-	/* sort the items so that we can detect the groups */
+	/*
-	qsort_arg((void *) items, numrows, sizeof(SortItem),
+	 * build an array of SortItem(s) sorted using the multi-sort support
-			  multi_sort_compare, mss);
+	 *
+	 * XXX This relies on all stats entries pointing to the same tuple
+	 * descriptor.  For now that assumption holds, but it might change in
+	 * the future for example if we support statistics on multiple tables.
+	 */
+	items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
+							   mss, k, attnums_dep);
 	/*
 	 * Walk through the sorted array, split it into rows according to the
@@ -295,14 +282,14 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 	group_size = 1;
 	/* loop 1 beyond the end of the array so that we count the final group */
-	for (i = 1; i <= numrows; i++)
+	for (i = 1; i <= nitems; i++)
 	{
 		/*
 		 * Check if the group ended, which may be either because we processed
-		 * all the items (i==numrows), or because the i-th item is not equal
+		 * all the items (i==nitems), or because the i-th item is not equal
 		 * to the preceding one.
 		 */
-		if (i == numrows ||
+		if (i == nitems ||
 			multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0)
 		{
 			/*
@@ -324,10 +311,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 		group_size++;
 	}
+	if (items)
 		pfree(items);
-	pfree(values);
-	pfree(isnull);
 	pfree(mss);
+	pfree(attnums);
+	pfree(attnums_dep);
 	/* Compute the 'degree of validity' as (supporting/total). */
 	return (n_supporting_rows * 1.0 / numrows);
@@ -354,24 +343,17 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
 						   VacAttrStats **stats)
 {
 	int			i,
-				j,
 				k;
 	int			numattrs;
-	int		   *attnums;
+	AttrNumber *attnums;
 	/* result */
 	MVDependencies *dependencies = NULL;
-	numattrs = bms_num_members(attrs);
 	/*
 	 * Transform the bms into an array, to make accessing i-th member easier.
 	 */
-	attnums = palloc(sizeof(int) * bms_num_members(attrs));
+	attnums = build_attnums_array(attrs, &numattrs);
-	i = 0;
-	j = -1;
-	while ((j = bms_next_member(attrs, j)) >= 0)
-		attnums[i++] = j;
 	Assert(numattrs >= 2);
@@ -918,9 +900,9 @@ find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies,
 *		using functional dependency statistics, or 1.0 if no useful functional
 *		dependency statistic exists.
 *
- * 'estimatedclauses' is an output argument that gets a bit set corresponding
+ * 'estimatedclauses' is an input/output argument that gets a bit set
- * to the (zero-based) list index of each clause that is included in the
+ * corresponding to the (zero-based) list index of each clause that is included
- * estimated selectivity.
+ * in the estimated selectivity.
 *
 * Given equality clauses on attributes (a,b) we find the strongest dependency
 * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected
@@ -955,9 +937,6 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 	AttrNumber *list_attnums;
 	int			listidx;
-	/* initialize output argument */
-	*estimatedclauses = NULL;
 	/* check if there's any stats that might be useful for us. */
 	if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
 		return 1.0;
@@ -972,6 +951,9 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 	 * the attnums for each clause in a list which we'll reference later so we
 	 * don't need to repeat the same work again. We'll also keep track of all
 	 * attnums seen.
+	 *
+	 * We also skip clauses that we already estimated using different types of
+	 * statistics (we treat them as incompatible).
 	 */
 	listidx = 0;
 	foreach(l, clauses)
@@ -979,7 +961,8 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 		Node	   *clause = (Node *) lfirst(l);
 		AttrNumber	attnum;
-		if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+		if (!bms_is_member(listidx, *estimatedclauses) &&
+			dependency_is_compatible_clause(clause, rel->relid, &attnum))
 		{
 			list_attnums[listidx] = attnum;
 			clauses_attnums = bms_add_member(clauses_attnums, attnum);
@@ -1049,8 +1032,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 			/*
 			 * Skip incompatible clauses, and ones we've already estimated on.
 			 */
-			if (list_attnums[listidx] == InvalidAttrNumber ||
+			if (list_attnums[listidx] == InvalidAttrNumber)
-				bms_is_member(listidx, *estimatedclauses))
 				continue;
 			/*

--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -1509,6 +1509,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 	bool		isnull;
 	bool		ndistinct_enabled;
 	bool		dependencies_enabled;
+	bool		mcv_enabled;
 	int			i;
 	statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
@@ -1544,6 +1545,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 	ndistinct_enabled = false;
 	dependencies_enabled = false;
+	mcv_enabled = false;
 	for (i = 0; i < ARR_DIMS(arr)[0]; i++)
 	{
@@ -1551,6 +1553,8 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 			ndistinct_enabled = true;
 		if (enabled[i] == STATS_EXT_DEPENDENCIES)
 			dependencies_enabled = true;
+		if (enabled[i] == STATS_EXT_MCV)
+			mcv_enabled = true;
 	}
 	/*
@@ -1560,13 +1564,27 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 	 * statistics types on a newer postgres version, if the statistics had all
 	 * options enabled on the original version.
 	 */
-	if (!ndistinct_enabled || !dependencies_enabled)
+	if (!ndistinct_enabled || !dependencies_enabled || !mcv_enabled)
 	{
+		bool	gotone = false;
 		appendStringInfoString(&buf, " (");
 		if (ndistinct_enabled)
+		{
 			appendStringInfoString(&buf, "ndistinct");
-		else if (dependencies_enabled)
+			gotone = true;
-			appendStringInfoString(&buf, "dependencies");
+		}
+		if (dependencies_enabled)
+		{
+			appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
+			gotone = true;
+		}
+		if (mcv_enabled)
+			appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
 		appendStringInfoChar(&buf, ')');
 	}

--- a/src/bin/psql/describe.c
+++ b/src/bin/psql/describe.c
@@ -2629,7 +2629,8 @@ describeOneTableDetails(const char *schemaname,
 							  "   JOIN pg_catalog.pg_attribute a ON (stxrelid = a.attrelid AND\n"
 							  "        a.attnum = s.attnum AND NOT attisdropped)) AS columns,\n"
 							  "  'd' = any(stxkind) AS ndist_enabled,\n"
-							  "  'f' = any(stxkind) AS deps_enabled\n"
+							  "  'f' = any(stxkind) AS deps_enabled,\n"
+							  "  'm' = any(stxkind) AS mcv_enabled\n"
 							  "FROM pg_catalog.pg_statistic_ext stat "
 							  "WHERE stxrelid = '%s'\n"
 							  "ORDER BY 1;",
@@ -2666,6 +2667,12 @@ describeOneTableDetails(const char *schemaname,
 					if (strcmp(PQgetvalue(result, i, 6), "t") == 0)
 					{
 						appendPQExpBuffer(&buf, "%sdependencies", gotone ? ", " : "");
+						gotone = true;
+					}
+					if (strcmp(PQgetvalue(result, i, 7), "t") == 0)
+					{
+						appendPQExpBuffer(&buf, "%smcv", gotone ? ", " : "");
 					}
 					appendPQExpBuffer(&buf, ") ON %s FROM %s",

--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -2432,7 +2432,7 @@ psql_completion(const char *text, int start, int end)
 	else if (Matches("CREATE", "STATISTICS", MatchAny))
 		COMPLETE_WITH("(", "ON");
 	else if (Matches("CREATE", "STATISTICS", MatchAny, "("))
-		COMPLETE_WITH("ndistinct", "dependencies");
+		COMPLETE_WITH("ndistinct", "dependencies", "mcv");
 	else if (Matches("CREATE", "STATISTICS", MatchAny, "(*)"))
 		COMPLETE_WITH("ON");
 	else if (HeadMatches("CREATE", "STATISTICS", MatchAny) &&

--- a/src/include/catalog/pg_cast.dat
+++ b/src/include/catalog/pg_cast.dat
@@ -324,6 +324,12 @@
 { castsource => 'pg_dependencies', casttarget => 'text', castfunc => '0',
  castcontext => 'i', castmethod => 'i' },
+# pg_mcv_list can be coerced to, but not from, bytea and text
+{ castsource => 'pg_mcv_list', casttarget => 'bytea', castfunc => '0',
+  castcontext => 'i', castmethod => 'b' },
+{ castsource => 'pg_mcv_list', casttarget => 'text', castfunc => '0',
+  castcontext => 'i', castmethod => 'i' },
 # Datetime category
 { castsource => 'date', casttarget => 'timestamp',
  castfunc => 'timestamp(date)', castcontext => 'i', castmethod => 'f' },

--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -4999,6 +4999,30 @@
  proname => 'pg_dependencies_send', provolatile => 's', prorettype => 'bytea',
  proargtypes => 'pg_dependencies', prosrc => 'pg_dependencies_send' },
+{ oid => '5018', descr => 'I/O',
+  proname => 'pg_mcv_list_in', prorettype => 'pg_mcv_list',
+  proargtypes => 'cstring', prosrc => 'pg_mcv_list_in' },
+{ oid => '5019', descr => 'I/O',
+  proname => 'pg_mcv_list_out', prorettype => 'cstring',
+  proargtypes => 'pg_mcv_list', prosrc => 'pg_mcv_list_out' },
+{ oid => '5020', descr => 'I/O',
+  proname => 'pg_mcv_list_recv', provolatile => 's',
+  prorettype => 'pg_mcv_list', proargtypes => 'internal',
+  prosrc => 'pg_mcv_list_recv' },
+{ oid => '5021', descr => 'I/O',
+  proname => 'pg_mcv_list_send', provolatile => 's', prorettype => 'bytea',
+  proargtypes => 'pg_mcv_list', prosrc => 'pg_mcv_list_send' },
+{ oid => '3427',
+  descr => 'details about MCV list items',
+  proname => 'pg_mcv_list_items', prorows => '1000', proisstrict => 't',
+  proretset => 't', provolatile => 's', prorettype => 'record',
+  proargtypes => 'pg_mcv_list',
+  proallargtypes => '{pg_mcv_list,int4,text,_bool,float8,float8}',
+  proargmodes => '{i,o,o,o,o,o}',
+  proargnames => '{mcv_list,index,values,nulls,frequency,base_frequency}',
+  prosrc => 'pg_stats_ext_mcvlist_items' },
 { oid => '1928', descr => 'statistics: number of scans done for table/index',
  proname => 'pg_stat_get_numscans', provolatile => 's', proparallel => 'r',
  prorettype => 'int8', proargtypes => 'oid',

--- a/src/include/catalog/pg_statistic_ext.h
+++ b/src/include/catalog/pg_statistic_ext.h
@@ -49,6 +49,7 @@ CATALOG(pg_statistic_ext,3381,StatisticExtRelationId)
 												 * to build */
 	pg_ndistinct stxndistinct;	/* ndistinct coefficients (serialized) */
 	pg_dependencies stxdependencies;	/* dependencies (serialized) */
+	pg_mcv_list stxmcv;			/* MCV (serialized) */
 #endif
 } FormData_pg_statistic_ext;
@@ -64,6 +65,7 @@ typedef FormData_pg_statistic_ext *Form_pg_statistic_ext;
 #define STATS_EXT_NDISTINCT			'd'
 #define STATS_EXT_DEPENDENCIES		'f'
+#define STATS_EXT_MCV				'm'
 #endif							/* EXPOSE_TO_CLIENT_CODE */

--- a/src/include/catalog/pg_type.dat
+++ b/src/include/catalog/pg_type.dat
@@ -165,6 +165,13 @@
  typoutput => 'pg_dependencies_out', typreceive => 'pg_dependencies_recv',
  typsend => 'pg_dependencies_send', typalign => 'i', typstorage => 'x',
  typcollation => 'default' },
+{ oid => '5017', oid_symbol => 'PGMCVLISTOID',
+  descr => 'multivariate MCV list',
+  typname => 'pg_mcv_list', typlen => '-1', typbyval => 'f',
+  typcategory => 'S', typinput => 'pg_mcv_list_in',
+  typoutput => 'pg_mcv_list_out', typreceive => 'pg_mcv_list_recv',
+  typsend => 'pg_mcv_list_send', typalign => 'i', typstorage => 'x',
+  typcollation => 'default' },
 { oid => '32', oid_symbol => 'PGDDLCOMMANDOID',
  descr => 'internal type for passing CollectedCommand',
  typname => 'pg_ddl_command', typlen => 'SIZEOF_POINTER', typbyval => 't',

--- a/src/include/nodes/bitmapset.h
+++ b/src/include/nodes/bitmapset.h
@@ -87,6 +87,7 @@ extern Bitmapset *bms_difference(const Bitmapset *a, const Bitmapset *b);
 extern bool bms_is_subset(const Bitmapset *a, const Bitmapset *b);
 extern BMS_Comparison bms_subset_compare(const Bitmapset *a, const Bitmapset *b);
 extern bool bms_is_member(int x, const Bitmapset *a);
+extern int	bms_member_index(Bitmapset *a, int x);
 extern bool bms_overlap(const Bitmapset *a, const Bitmapset *b);
 extern bool bms_overlap_list(const Bitmapset *a, const struct List *b);
 extern bool bms_nonempty_difference(const Bitmapset *a, const Bitmapset *b);

--- a/src/include/optimizer/optimizer.h
+++ b/src/include/optimizer/optimizer.h
@@ -58,6 +58,12 @@ extern Selectivity clause_selectivity(PlannerInfo *root,
 				   int varRelid,
 				   JoinType jointype,
 				   SpecialJoinInfo *sjinfo);
+extern Selectivity clauselist_selectivity_simple(PlannerInfo *root,
+							  List *clauses,
+							  int varRelid,
+							  JoinType jointype,
+							  SpecialJoinInfo *sjinfo,
+							  Bitmapset *estimatedclauses);
 extern Selectivity clauselist_selectivity(PlannerInfo *root,
 					   List *clauses,
 					   int varRelid,

--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -31,6 +31,15 @@ typedef struct
 	int			tupno;			/* position index for tuple it came from */
 } ScalarItem;
+/* (de)serialization info */
+typedef struct DimensionInfo
+{
+	int			nvalues;		/* number of deduplicated values */
+	int			nbytes;			/* number of bytes (serialized) */
+	int			typlen;			/* pg_type.typlen */
+	bool		typbyval;		/* pg_type.typbyval */
+} DimensionInfo;
 /* multi-sort */
 typedef struct MultiSortSupportData
 {
@@ -44,6 +53,7 @@ typedef struct SortItem
 {
 	Datum	   *values;
 	bool	   *isnull;
+	int			count;
 } SortItem;
 extern MVNDistinct *statext_ndistinct_build(double totalrows,
@@ -57,6 +67,12 @@ extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows,
 extern bytea *statext_dependencies_serialize(MVDependencies *dependencies);
 extern MVDependencies *statext_dependencies_deserialize(bytea *data);
+extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows,
+				  Bitmapset *attrs, VacAttrStats **stats,
+				  double totalrows);
+extern bytea *statext_mcv_serialize(MCVList * mcv, VacAttrStats **stats);
+extern MCVList * statext_mcv_deserialize(bytea *data);
 extern MultiSortSupport multi_sort_init(int ndims);
 extern void multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
 						 Oid oper, Oid collation);
@@ -65,5 +81,29 @@ extern int multi_sort_compare_dim(int dim, const SortItem *a,
 					   const SortItem *b, MultiSortSupport mss);
 extern int multi_sort_compare_dims(int start, int end, const SortItem *a,
 						const SortItem *b, MultiSortSupport mss);
+extern int	compare_scalars_simple(const void *a, const void *b, void *arg);
+extern int	compare_datums_simple(Datum a, Datum b, SortSupport ssup);
+extern void *bsearch_arg(const void *key, const void *base,
+			size_t nmemb, size_t size,
+			int (*compar) (const void *, const void *, void *),
+			void *arg);
+extern AttrNumber *build_attnums_array(Bitmapset *attrs, int *numattrs);
+extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows,
+				   TupleDesc tdesc, MultiSortSupport mss,
+				   int numattrs, AttrNumber *attnums);
+extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
+						   StatisticExtInfo *stat,
+						   List *clauses,
+						   int varRelid,
+						   JoinType jointype,
+						   SpecialJoinInfo *sjinfo,
+						   RelOptInfo *rel,
+						   Selectivity *basesel,
+						   Selectivity *totalsel);
 #endif							/* EXTENDED_STATS_INTERNAL_H */
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -78,8 +78,41 @@ typedef struct MVDependencies
 /* size of the struct excluding the deps array */
 #define SizeOfDependencies	(offsetof(MVDependencies, ndeps) + sizeof(uint32))
+/* used to flag stats serialized to bytea */
+#define STATS_MCV_MAGIC			0xE1A651C2	/* marks serialized bytea */
+#define STATS_MCV_TYPE_BASIC	1	/* basic MCV list type */
+/* max items in MCV list (mostly arbitrary number) */
+#define STATS_MCVLIST_MAX_ITEMS        8192
+/*
+ * Multivariate MCV (most-common value) lists
+ *
+ * A straightforward extension of MCV items - i.e. a list (array) of
+ * combinations of attribute values, together with a frequency and null flags.
+ */
+typedef struct MCVItem
+{
+	double		frequency;		/* frequency of this combination */
+	double		base_frequency;	/* frequency if independent */
+	bool	   *isnull;			/* NULL flags */
+	Datum	   *values;			/* item values */
+} MCVItem;
+/* multivariate MCV list - essentally an array of MCV items */
+typedef struct MCVList
+{
+	uint32		magic;			/* magic constant marker */
+	uint32		type;			/* type of MCV list (BASIC) */
+	uint32		nitems;			/* number of MCV items in the array */
+	AttrNumber	ndimensions;	/* number of dimensions */
+	Oid			types[STATS_MAX_DIMENSIONS];	/* OIDs of data types */
+	MCVItem   **items;			/* array of MCV items */
+} MCVList;
 extern MVNDistinct *statext_ndistinct_load(Oid mvoid);
 extern MVDependencies *statext_dependencies_load(Oid mvoid);
+extern MCVList *statext_mcv_load(Oid mvoid);
 extern void BuildRelationExtStatistics(Relation onerel, double totalrows,
 						   int numrows, HeapTuple *rows,
@@ -92,6 +125,13 @@ extern Selectivity dependencies_clauselist_selectivity(PlannerInfo *root,
 									SpecialJoinInfo *sjinfo,
 									RelOptInfo *rel,
 									Bitmapset **estimatedclauses);
+extern Selectivity statext_clauselist_selectivity(PlannerInfo *root,
+							   List *clauses,
+							   int varRelid,
+							   JoinType jointype,
+							   SpecialJoinInfo *sjinfo,
+							   RelOptInfo *rel,
+							   Bitmapset **estimatedclauses);
 extern bool has_stats_of_kind(List *stats, char requiredkind);
 extern StatisticExtInfo *choose_best_statistics(List *stats,
 					   Bitmapset *attnums, char requiredkind);

--- a/src/test/regress/expected/create_table_like.out
+++ b/src/test/regress/expected/create_table_like.out
@@ -243,7 +243,7 @@ Indexes:
 Check constraints:
    "ctlt1_a_check" CHECK (length(a) > 2)
 Statistics objects:
-    "public"."ctlt_all_a_b_stat" (ndistinct, dependencies) ON a, b FROM ctlt_all
+    "public"."ctlt_all_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt_all
 SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_class c WHERE classoid = 'pg_class'::regclass AND objoid = i.indexrelid AND c.oid = i.indexrelid AND i.indrelid = 'ctlt_all'::regclass ORDER BY c.relname, objsubid;
    relname     | objsubid | description 

--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -892,11 +892,12 @@ WHERE c.castmethod = 'b' AND
 pg_node_tree      | text              |        0 | i
 pg_ndistinct      | bytea             |        0 | i
 pg_dependencies   | bytea             |        0 | i
+ pg_mcv_list       | bytea             |        0 | i
 cidr              | inet              |        0 | i
 xml               | text              |        0 | a
 xml               | character varying |        0 | a
 xml               | character         |        0 | a
-(9 rows)
+(10 rows)
 -- **************** pg_conversion ****************
 -- Look for illegal values in pg_conversion fields.

--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -58,7 +58,7 @@ ALTER TABLE ab1 DROP COLUMN a;
 b      | integer |           |          | 
 c      | integer |           |          | 
 Statistics objects:
-    "public"."ab1_b_c_stats" (ndistinct, dependencies) ON b, c FROM ab1
+    "public"."ab1_b_c_stats" (ndistinct, dependencies, mcv) ON b, c FROM ab1
 -- Ensure statistics are dropped when table is
 SELECT stxname FROM pg_statistic_ext WHERE stxname LIKE 'ab1%';
@@ -211,7 +211,7 @@ SELECT stxkind, stxndistinct
  FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
 stxkind |                      stxndistinct                       
 ---------+---------------------------------------------------------
- {d,f}   | {"3, 4": 301, "3, 6": 301, "4, 6": 301, "3, 4, 6": 301}
+ {d,f,m} | {"3, 4": 301, "3, 6": 301, "4, 6": 301, "3, 4, 6": 301}
 (1 row)
 -- Hash Aggregate, thanks to estimates improved by the statistic
@@ -277,7 +277,7 @@ SELECT stxkind, stxndistinct
  FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
 stxkind |                        stxndistinct                         
 ---------+-------------------------------------------------------------
- {d,f}   | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 10000}
+ {d,f,m} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 10000}
 (1 row)
 -- plans using Group Aggregate, thanks to using correct esimates
@@ -514,3 +514,284 @@ EXPLAIN (COSTS OFF)
 (5 rows)
 RESET random_page_cost;
+-- check the number of estimated/actual rows in the top node
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
+-- MCV lists
+CREATE TABLE mcv_lists (
+    filler1 TEXT,
+    filler2 NUMERIC,
+    a INT,
+    b VARCHAR,
+    filler3 DATE,
+    c INT,
+    d TEXT
+);
+-- random data (no MCV list)
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,37), mod(i,41), mod(i,43), mod(i,47) FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         3 |      4
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         3 |      4
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+-- 100 distinct combinations, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 1 AND b < ''1''');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 0 AND b <= ''0''');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND b < ''1'' AND c < 5');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 4 AND b <= ''0'' AND c <= 4');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 1 AND b < ''1''');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 0 AND b <= ''0''');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND b < ''1'' AND c < 5');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 4 AND b <= ''0'' AND c <= 4');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+-- check change of unrelated column type does not reset the MCV statistics
+ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64);
+SELECT stxmcv IS NOT NULL FROM pg_statistic_ext WHERE stxname = 'mcv_lists_stats';
+ ?column? 
+----------
+ t
+(1 row)
+-- check change of column type resets the MCV statistics
+ALTER TABLE mcv_lists ALTER COLUMN c TYPE numeric;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+-- 100 distinct combinations with NULL values, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT
+         (CASE WHEN mod(i,100) = 1 THEN NULL ELSE mod(i,100) END),
+         (CASE WHEN mod(i,50) = 1  THEN NULL ELSE mod(i,50) END),
+         (CASE WHEN mod(i,25) = 1  THEN NULL ELSE mod(i,25) END),
+         i
+     FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+RESET random_page_cost;
+-- mcv with arrays
+CREATE TABLE mcv_lists_arrays (
+    a TEXT[],
+    b NUMERIC[],
+    c INT[]
+);
+INSERT INTO mcv_lists_arrays (a, b, c)
+     SELECT
+         ARRAY[md5((i/100)::text), md5((i/100-1)::text), md5((i/100+1)::text)],
+         ARRAY[(i/100-1)::numeric/1000, (i/100)::numeric/1000, (i/100+1)::numeric/1000],
+         ARRAY[(i/100-1), i/100, (i/100+1)]
+     FROM generate_series(1,5000) s(i);
+CREATE STATISTICS mcv_lists_arrays_stats (mcv) ON a, b, c
+  FROM mcv_lists_arrays;
+ANALYZE mcv_lists_arrays;
+-- mcv with bool
+CREATE TABLE mcv_lists_bool (
+    a BOOL,
+    b BOOL,
+    c BOOL
+);
+INSERT INTO mcv_lists_bool (a, b, c)
+     SELECT
+         (mod(i,2) = 0), (mod(i,4) = 0), (mod(i,8) = 0)
+     FROM generate_series(1,10000) s(i);
+ANALYZE mcv_lists_bool;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE a AND b AND c');
+ estimated | actual 
+-----------+--------
+       156 |   1250
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND c');
+ estimated | actual 
+-----------+--------
+       156 |      0
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND NOT b AND c');
+ estimated | actual 
+-----------+--------
+       469 |      0
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
+ estimated | actual 
+-----------+--------
+      1094 |      0
+(1 row)
+CREATE STATISTICS mcv_lists_bool_stats (mcv) ON a, b, c
+  FROM mcv_lists_bool;
+ANALYZE mcv_lists_bool;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE a AND b AND c');
+ estimated | actual 
+-----------+--------
+      1250 |   1250
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND c');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND NOT b AND c');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
--- a/src/test/regress/expected/type_sanity.out
+++ b/src/test/regress/expected/type_sanity.out
@@ -72,7 +72,8 @@ WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%'
  194 | pg_node_tree
 3361 | pg_ndistinct
 3402 | pg_dependencies
-(3 rows)
+ 5017 | pg_mcv_list
+(4 rows)
 -- Make sure typarray points to a varlena array type of our own base
 SELECT p1.oid, p1.typname as basetype, p2.typname as arraytype,

--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -280,3 +280,189 @@ EXPLAIN (COSTS OFF)
 SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
 RESET random_page_cost;
+-- check the number of estimated/actual rows in the top node
+create function check_estimated_rows(text) returns table (estimated int, actual int)
+language plpgsql as
+$$
+declare
+    ln text;
+    tmp text[];
+    first_row bool := true;
+begin
+    for ln in
+        execute format('explain analyze %s', $1)
+    loop
+        if first_row then
+            first_row := false;
+            tmp := regexp_match(ln, 'rows=(\d*) .* rows=(\d*)');
+            return query select tmp[1]::int, tmp[2]::int;
+        end if;
+    end loop;
+end;
+$$;
+-- MCV lists
+CREATE TABLE mcv_lists (
+    filler1 TEXT,
+    filler2 NUMERIC,
+    a INT,
+    b VARCHAR,
+    filler3 DATE,
+    c INT,
+    d TEXT
+);
+-- random data (no MCV list)
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,37), mod(i,41), mod(i,43), mod(i,47) FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+-- 100 distinct combinations, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 1 AND b < ''1''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 0 AND b <= ''0''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND b < ''1'' AND c < 5');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 4 AND b <= ''0'' AND c <= 4');
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 1 AND b < ''1''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 0 AND b <= ''0''');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < 5 AND b < ''1'' AND c < 5');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 4 AND b <= ''0'' AND c <= 4');
+-- check change of unrelated column type does not reset the MCV statistics
+ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64);
+SELECT stxmcv IS NOT NULL FROM pg_statistic_ext WHERE stxname = 'mcv_lists_stats';
+-- check change of column type resets the MCV statistics
+ALTER TABLE mcv_lists ALTER COLUMN c TYPE numeric;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+-- 100 distinct combinations with NULL values, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT
+         (CASE WHEN mod(i,100) = 1 THEN NULL ELSE mod(i,100) END),
+         (CASE WHEN mod(i,50) = 1  THEN NULL ELSE mod(i,50) END),
+         (CASE WHEN mod(i,25) = 1  THEN NULL ELSE mod(i,25) END),
+         i
+     FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL');
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL');
+RESET random_page_cost;
+-- mcv with arrays
+CREATE TABLE mcv_lists_arrays (
+    a TEXT[],
+    b NUMERIC[],
+    c INT[]
+);
+INSERT INTO mcv_lists_arrays (a, b, c)
+     SELECT
+         ARRAY[md5((i/100)::text), md5((i/100-1)::text), md5((i/100+1)::text)],
+         ARRAY[(i/100-1)::numeric/1000, (i/100)::numeric/1000, (i/100+1)::numeric/1000],
+         ARRAY[(i/100-1), i/100, (i/100+1)]
+     FROM generate_series(1,5000) s(i);
+CREATE STATISTICS mcv_lists_arrays_stats (mcv) ON a, b, c
+  FROM mcv_lists_arrays;
+ANALYZE mcv_lists_arrays;
+-- mcv with bool
+CREATE TABLE mcv_lists_bool (
+    a BOOL,
+    b BOOL,
+    c BOOL
+);
+INSERT INTO mcv_lists_bool (a, b, c)
+     SELECT
+         (mod(i,2) = 0), (mod(i,4) = 0), (mod(i,8) = 0)
+     FROM generate_series(1,10000) s(i);
+ANALYZE mcv_lists_bool;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE a AND b AND c');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND c');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND NOT b AND c');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
+CREATE STATISTICS mcv_lists_bool_stats (mcv) ON a, b, c
+  FROM mcv_lists_bool;
+ANALYZE mcv_lists_bool;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE a AND b AND c');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND c');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND NOT b AND c');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');