Create a type-specific typanalyze routine for tsvector, which collects stats on the most common individual lexemes in place of the mostly-useless default behavior of counting duplicate tsvectors. Future work: create selectivity estimation functions that actually do something with these stats.

(Some other things we ought to look at doing: using the Lossy Counting algorithm in compute_minimal_stats, and using the element-counting idea for stats on regular arrays.)

Jan Urbanski

Create a type-specific typanalyze routine for tsvector, which collects stats on the most common individual lexemes in place of the mostly-useless default behavior of counting duplicate tsvectors. Future work: create selectivity estimation functions that actually do something with these stats.

(Some other things we ought to look at doing: using the Lossy Counting algorithm in compute_minimal_stats, and using the element-counting idea for stats on regular arrays.)

Jan Urbanski
6f6d8632 · Tom Lane · 6816577a · 6f6d8632 · 6f6d8632 · 6f6d8632
Commit 6f6d8632 authored Jul 14, 2008 by Tom Lane
11 changed files
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.167 2008/07/11 07:02:43 petere Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.168 2008/07/14 00:51:45 tgl Exp $ -->
 <!--
 Documentation of the system catalogs, directed toward PostgreSQL developers
 -->
@@ -6516,6 +6516,8 @@
      <entry>
       A list of the most common values in the column. (NULL if
       no values seem to be more common than any others.)
+       For some datatypes such as <type>tsvector</>, this is a list of
+       the most common element values rather than values of the type itself.
      </entry>
     </row>

@@ -6524,10 +6526,10 @@
      <entry><type>real[]</type></entry>
      <entry></entry>
      <entry>
-       A list of the frequencies of the most common values,
+       A list of the frequencies of the most common values or elements,
       i.e., number of occurrences of each divided by total number of rows.
       (NULL when <structfield>most_common_vals</structfield> is.)
-     </entry>
+      </entry>
     </row>

     <row>

--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -3,7 +3,7 @@
 *
 * Copyright (c) 1996-2008, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.52 2008/05/15 00:17:39 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.53 2008/07/14 00:51:45 tgl Exp $
 */

 CREATE VIEW pg_roles AS 
@@ -110,30 +110,30 @@ CREATE VIEW pg_stats AS
        stanullfrac AS null_frac, 
        stawidth AS avg_width, 
        stadistinct AS n_distinct, 
-        CASE 1 
-            WHEN stakind1 THEN stavalues1 
-            WHEN stakind2 THEN stavalues2 
-            WHEN stakind3 THEN stavalues3 
-            WHEN stakind4 THEN stavalues4 
-        END AS most_common_vals, 
-        CASE 1 
-            WHEN stakind1 THEN stanumbers1 
-            WHEN stakind2 THEN stanumbers2 
-            WHEN stakind3 THEN stanumbers3 
-            WHEN stakind4 THEN stanumbers4 
-        END AS most_common_freqs, 
-        CASE 2 
-            WHEN stakind1 THEN stavalues1 
-            WHEN stakind2 THEN stavalues2 
-            WHEN stakind3 THEN stavalues3 
-            WHEN stakind4 THEN stavalues4 
-        END AS histogram_bounds, 
-        CASE 3 
-            WHEN stakind1 THEN stanumbers1[1] 
-            WHEN stakind2 THEN stanumbers2[1] 
-            WHEN stakind3 THEN stanumbers3[1] 
-            WHEN stakind4 THEN stanumbers4[1] 
-        END AS correlation 
+        CASE
+            WHEN stakind1 IN (1, 4) THEN stavalues1
+            WHEN stakind2 IN (1, 4) THEN stavalues2
+            WHEN stakind3 IN (1, 4) THEN stavalues3
+            WHEN stakind4 IN (1, 4) THEN stavalues4
+        END AS most_common_vals,
+        CASE
+            WHEN stakind1 IN (1, 4) THEN stanumbers1
+            WHEN stakind2 IN (1, 4) THEN stanumbers2
+            WHEN stakind3 IN (1, 4) THEN stanumbers3
+            WHEN stakind4 IN (1, 4) THEN stanumbers4
+        END AS most_common_freqs,
+        CASE
+            WHEN stakind1 = 2 THEN stavalues1
+            WHEN stakind2 = 2 THEN stavalues2
+            WHEN stakind3 = 2 THEN stavalues3
+            WHEN stakind4 = 2 THEN stavalues4
+        END AS histogram_bounds,
+        CASE
+            WHEN stakind1 = 3 THEN stanumbers1[1]
+            WHEN stakind2 = 3 THEN stanumbers2[1]
+            WHEN stakind3 = 3 THEN stanumbers3[1]
+            WHEN stakind4 = 3 THEN stanumbers4[1]
+        END AS correlation
    FROM pg_statistic s JOIN pg_class c ON (c.oid = s.starelid) 
         JOIN pg_attribute a ON (c.oid = attrelid AND attnum = s.staattnum) 
         LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace) 

--- a/src/backend/tsearch/Makefile
+++ b/src/backend/tsearch/Makefile
@@ -4,7 +4,7 @@
 #
 # Copyright (c) 2006-2008, PostgreSQL Global Development Group
 #
-# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.6 2008/02/19 10:30:08 petere Exp $
+# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.7 2008/07/14 00:51:45 tgl Exp $
 #
 #-------------------------------------------------------------------------
 subdir = src/backend/tsearch
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
 OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
 	dict_simple.o dict_synonym.o dict_thesaurus.o \
 	dict_ispell.o regis.o spell.o \
-	to_tsany.o ts_utils.o
+	to_tsany.o ts_typanalyze.o ts_utils.o

 include $(top_srcdir)/src/backend/common.mk


--- a/src/backend/tsearch/ts_typanalyze.c
+++ b/src/backend/tsearch/ts_typanalyze.c
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.466 2008/07/11 21:06:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.467 2008/07/14 00:51:45 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -53,6 +53,6 @@
 */

 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200807111
+#define CATALOG_VERSION_NO	200807131

 #endif
--- a/src/include/catalog/pg_operator.h
+++ b/src/include/catalog/pg_operator.h
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/pg_operator.h,v 1.160 2008/06/17 19:10:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_operator.h,v 1.161 2008/07/14 00:51:45 tgl Exp $
 *
 * NOTES
 *	  the genbki.sh script reads this file and generates .bki
@@ -105,6 +105,7 @@ DATA(insert OID =  95 ( "<"		   PGNSP PGUID b f f	21	21	16 520 524 int2lt scalar
 DATA(insert OID =  96 ( "="		   PGNSP PGUID b t t	23	23	16	96 518 int4eq eqsel eqjoinsel ));
 DATA(insert OID =  97 ( "<"		   PGNSP PGUID b f f	23	23	16 521 525 int4lt scalarltsel scalarltjoinsel ));
 DATA(insert OID =  98 ( "="		   PGNSP PGUID b t t	25	25	16	98 531 texteq eqsel eqjoinsel ));
+#define TextEqualOperator   98

 DATA(insert OID = 349 (  "||"	   PGNSP PGUID b f f 2277 2283 2277 0 0 array_append   -	   -	 ));
 DATA(insert OID = 374 (  "||"	   PGNSP PGUID b f f 2283 2277 2277 0 0 array_prepend  -	   -	 ));

--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.504 2008/07/03 20:58:46 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.505 2008/07/14 00:51:45 tgl Exp $
 *
 * NOTES
 *	  The script catalog/genbki.sh reads this file and generates .bki
@@ -4313,6 +4313,9 @@ DESCR("GiST tsquery support");
 DATA(insert OID = 3701 (  gtsquery_consistent			PGNSP PGUID 12 1 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ gtsquery_consistent - _null_ _null_ ));
 DESCR("GiST tsquery support");

+DATA(insert OID = 3688 (  ts_typanalyze	PGNSP PGUID 12 1 0 f f t f s 1 16 "2281" _null_ _null_ _null_ ts_typanalyze - _null_ _null_ ));
+DESCR("tsvector typanalyze");
+
 DATA(insert OID = 3689 (  ts_stat		PGNSP PGUID 12 10 10000 f f t t v 1 2249 "25" "{25,25,23,23}" "{i,o,o,o}" "{query,word,ndoc,nentry}" ts_stat1 - _null_ _null_ ));
 DESCR("statistics of tsvector column");
 DATA(insert OID = 3690 (  ts_stat		PGNSP PGUID 12 10 10000 f f t t v 2 2249 "25 25" "{25,25,25,23,23}" "{i,i,o,o,o}" "{query,weights,word,ndoc,nentry}" ts_stat2 - _null_ _null_ ));

--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/pg_statistic.h,v 1.35 2008/03/27 03:57:34 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_statistic.h,v 1.36 2008/07/14 00:51:45 tgl Exp $
 *
 * NOTES
 *	  the genbki.sh script reads this file and generates .bki
@@ -237,4 +237,19 @@ typedef FormData_pg_statistic *Form_pg_statistic;
 */
 #define STATISTIC_KIND_CORRELATION	3

+/*
+ * A "most common elements" slot is similar to a "most common values" slot,
+ * except that it stores the most common non-null *elements* of the column
+ * values.  This is useful when the column datatype is an array or some other
+ * type with identifiable elements (for instance, tsvector).  staop contains
+ * the equality operator appropriate to the element type.  stavalues contains
+ * the most common element values, and stanumbers their frequencies, with the
+ * same rules as for MCV slots.
+ *
+ * Note: in current usage for tsvector columns, the stavalues elements are of
+ * type text, even though their representation within tsvector is not
+ * exactly text.
+ */
+#define STATISTIC_KIND_MCELEM  4
+
 #endif   /* PG_STATISTIC_H */
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
@@ -8,7 +8,7 @@
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/catalog/pg_type.h,v 1.196 2008/06/24 17:58:27 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_type.h,v 1.197 2008/07/14 00:51:45 tgl Exp $
 *
 * NOTES
 *	  the genbki.sh script reads this file and generates .bki
@@ -543,7 +543,7 @@ DESCR("UUID datatype");
 DATA(insert OID = 2951 ( _uuid			PGNSP PGUID -1 f b t \054 0 2950 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 _null_ _null_ ));

 /* text search */
-DATA(insert OID = 3614 ( tsvector		PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - - i x f 0 -1 0 _null_ _null_ ));
+DATA(insert OID = 3614 ( tsvector		PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - ts_typanalyze i x f 0 -1 0 _null_ _null_ ));
 DESCR("text representation for text search");
 #define TSVECTOROID		3614
 DATA(insert OID = 3642 ( gtsvector		PGNSP PGUID -1 f b t \054 0 0 3644 gtsvectorin gtsvectorout - - - - - i p f 0 -1 0 _null_ _null_ ));

--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -5,7 +5,7 @@
 *
 * Copyright (c) 1998-2008, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.12 2008/06/10 08:55:50 heikki Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.13 2008/07/14 00:51:45 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -153,6 +153,8 @@ extern Datum ts_rankcd_wtt(PG_FUNCTION_ARGS);
 extern Datum ts_rankcd_ttf(PG_FUNCTION_ARGS);
 extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS);

+extern Datum ts_typanalyze(PG_FUNCTION_ARGS);
+

 /*
 * TSQuery

--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out