Commit 6f6d8632 authored by Tom Lane

Create a type-specific typanalyze routine for tsvector, which collects stats
on the most common individual lexemes in place of the mostly-useless default
behavior of counting duplicate tsvectors.  Future work: create selectivity
estimation functions that actually do something with these stats.

(Some other things we ought to look at doing: using the Lossy Counting
algorithm in compute_minimal_stats, and using the element-counting idea for
stats on regular arrays.)

Jan Urbanski
parent 6816577a
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.167 2008/07/11 07:02:43 petere Exp $ -->
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.168 2008/07/14 00:51:45 tgl Exp $ -->
<!--
Documentation of the system catalogs, directed toward PostgreSQL developers
-->
......@@ -6516,6 +6516,8 @@
<entry>
A list of the most common values in the column. (NULL if
no values seem to be more common than any others.)
For some datatypes such as <type>tsvector</>, this is a list of
the most common element values rather than values of the type itself.
</entry>
</row>
......@@ -6524,10 +6526,10 @@
<entry><type>real[]</type></entry>
<entry></entry>
<entry>
A list of the frequencies of the most common values,
A list of the frequencies of the most common values or elements,
i.e., number of occurrences of each divided by total number of rows.
(NULL when <structfield>most_common_vals</structfield> is.)
</entry>
</entry>
</row>
<row>
......
......@@ -3,7 +3,7 @@
*
* Copyright (c) 1996-2008, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.52 2008/05/15 00:17:39 tgl Exp $
* $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.53 2008/07/14 00:51:45 tgl Exp $
*/
CREATE VIEW pg_roles AS
......@@ -110,30 +110,30 @@ CREATE VIEW pg_stats AS
stanullfrac AS null_frac,
stawidth AS avg_width,
stadistinct AS n_distinct,
CASE 1
WHEN stakind1 THEN stavalues1
WHEN stakind2 THEN stavalues2
WHEN stakind3 THEN stavalues3
WHEN stakind4 THEN stavalues4
END AS most_common_vals,
CASE 1
WHEN stakind1 THEN stanumbers1
WHEN stakind2 THEN stanumbers2
WHEN stakind3 THEN stanumbers3
WHEN stakind4 THEN stanumbers4
END AS most_common_freqs,
CASE 2
WHEN stakind1 THEN stavalues1
WHEN stakind2 THEN stavalues2
WHEN stakind3 THEN stavalues3
WHEN stakind4 THEN stavalues4
END AS histogram_bounds,
CASE 3
WHEN stakind1 THEN stanumbers1[1]
WHEN stakind2 THEN stanumbers2[1]
WHEN stakind3 THEN stanumbers3[1]
WHEN stakind4 THEN stanumbers4[1]
END AS correlation
CASE
WHEN stakind1 IN (1, 4) THEN stavalues1
WHEN stakind2 IN (1, 4) THEN stavalues2
WHEN stakind3 IN (1, 4) THEN stavalues3
WHEN stakind4 IN (1, 4) THEN stavalues4
END AS most_common_vals,
CASE
WHEN stakind1 IN (1, 4) THEN stanumbers1
WHEN stakind2 IN (1, 4) THEN stanumbers2
WHEN stakind3 IN (1, 4) THEN stanumbers3
WHEN stakind4 IN (1, 4) THEN stanumbers4
END AS most_common_freqs,
CASE
WHEN stakind1 = 2 THEN stavalues1
WHEN stakind2 = 2 THEN stavalues2
WHEN stakind3 = 2 THEN stavalues3
WHEN stakind4 = 2 THEN stavalues4
END AS histogram_bounds,
CASE
WHEN stakind1 = 3 THEN stanumbers1[1]
WHEN stakind2 = 3 THEN stanumbers2[1]
WHEN stakind3 = 3 THEN stanumbers3[1]
WHEN stakind4 = 3 THEN stanumbers4[1]
END AS correlation
FROM pg_statistic s JOIN pg_class c ON (c.oid = s.starelid)
JOIN pg_attribute a ON (c.oid = attrelid AND attnum = s.staattnum)
LEFT JOIN pg_namespace n ON (n.oid = c.relnamespace)
......
......@@ -4,7 +4,7 @@
#
# Copyright (c) 2006-2008, PostgreSQL Global Development Group
#
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.6 2008/02/19 10:30:08 petere Exp $
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.7 2008/07/14 00:51:45 tgl Exp $
#
#-------------------------------------------------------------------------
subdir = src/backend/tsearch
......@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
dict_simple.o dict_synonym.o dict_thesaurus.o \
dict_ispell.o regis.o spell.o \
to_tsany.o ts_utils.o
to_tsany.o ts_typanalyze.o ts_utils.o
include $(top_srcdir)/src/backend/common.mk
......
This diff is collapsed.
......@@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.466 2008/07/11 21:06:29 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.467 2008/07/14 00:51:45 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 200807111
#define CATALOG_VERSION_NO 200807131
#endif
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/pg_operator.h,v 1.160 2008/06/17 19:10:56 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/pg_operator.h,v 1.161 2008/07/14 00:51:45 tgl Exp $
*
* NOTES
* the genbki.sh script reads this file and generates .bki
......@@ -105,6 +105,7 @@ DATA(insert OID = 95 ( "<" PGNSP PGUID b f f 21 21 16 520 524 int2lt scalar
DATA(insert OID = 96 ( "=" PGNSP PGUID b t t 23 23 16 96 518 int4eq eqsel eqjoinsel ));
DATA(insert OID = 97 ( "<" PGNSP PGUID b f f 23 23 16 521 525 int4lt scalarltsel scalarltjoinsel ));
DATA(insert OID = 98 ( "=" PGNSP PGUID b t t 25 25 16 98 531 texteq eqsel eqjoinsel ));
#define TextEqualOperator 98
DATA(insert OID = 349 ( "||" PGNSP PGUID b f f 2277 2283 2277 0 0 array_append - - ));
DATA(insert OID = 374 ( "||" PGNSP PGUID b f f 2283 2277 2277 0 0 array_prepend - - ));
......
......@@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.504 2008/07/03 20:58:46 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.505 2008/07/14 00:51:45 tgl Exp $
*
* NOTES
* The script catalog/genbki.sh reads this file and generates .bki
......@@ -4313,6 +4313,9 @@ DESCR("GiST tsquery support");
DATA(insert OID = 3701 ( gtsquery_consistent PGNSP PGUID 12 1 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ gtsquery_consistent - _null_ _null_ ));
DESCR("GiST tsquery support");
DATA(insert OID = 3688 ( ts_typanalyze PGNSP PGUID 12 1 0 f f t f s 1 16 "2281" _null_ _null_ _null_ ts_typanalyze - _null_ _null_ ));
DESCR("tsvector typanalyze");
DATA(insert OID = 3689 ( ts_stat PGNSP PGUID 12 10 10000 f f t t v 1 2249 "25" "{25,25,23,23}" "{i,o,o,o}" "{query,word,ndoc,nentry}" ts_stat1 - _null_ _null_ ));
DESCR("statistics of tsvector column");
DATA(insert OID = 3690 ( ts_stat PGNSP PGUID 12 10 10000 f f t t v 2 2249 "25 25" "{25,25,25,23,23}" "{i,i,o,o,o}" "{query,weights,word,ndoc,nentry}" ts_stat2 - _null_ _null_ ));
......
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/pg_statistic.h,v 1.35 2008/03/27 03:57:34 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/pg_statistic.h,v 1.36 2008/07/14 00:51:45 tgl Exp $
*
* NOTES
* the genbki.sh script reads this file and generates .bki
......@@ -237,4 +237,19 @@ typedef FormData_pg_statistic *Form_pg_statistic;
*/
#define STATISTIC_KIND_CORRELATION 3
/*
* A "most common elements" slot is similar to a "most common values" slot,
* except that it stores the most common non-null *elements* of the column
* values. This is useful when the column datatype is an array or some other
* type with identifiable elements (for instance, tsvector). staop contains
* the equality operator appropriate to the element type. stavalues contains
* the most common element values, and stanumbers their frequencies, with the
* same rules as for MCV slots.
*
* Note: in current usage for tsvector columns, the stavalues elements are of
* type text, even though their representation within tsvector is not
* exactly text.
*/
#define STATISTIC_KIND_MCELEM 4
#endif /* PG_STATISTIC_H */
......@@ -8,7 +8,7 @@
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/pg_type.h,v 1.196 2008/06/24 17:58:27 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/pg_type.h,v 1.197 2008/07/14 00:51:45 tgl Exp $
*
* NOTES
* the genbki.sh script reads this file and generates .bki
......@@ -543,7 +543,7 @@ DESCR("UUID datatype");
DATA(insert OID = 2951 ( _uuid PGNSP PGUID -1 f b t \054 0 2950 0 array_in array_out array_recv array_send - - - i x f 0 -1 0 _null_ _null_ ));
/* text search */
DATA(insert OID = 3614 ( tsvector PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - - i x f 0 -1 0 _null_ _null_ ));
DATA(insert OID = 3614 ( tsvector PGNSP PGUID -1 f b t \054 0 0 3643 tsvectorin tsvectorout tsvectorrecv tsvectorsend - - ts_typanalyze i x f 0 -1 0 _null_ _null_ ));
DESCR("text representation for text search");
#define TSVECTOROID 3614
DATA(insert OID = 3642 ( gtsvector PGNSP PGUID -1 f b t \054 0 0 3644 gtsvectorin gtsvectorout - - - - - i p f 0 -1 0 _null_ _null_ ));
......
......@@ -5,7 +5,7 @@
*
* Copyright (c) 1998-2008, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.12 2008/06/10 08:55:50 heikki Exp $
* $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.13 2008/07/14 00:51:45 tgl Exp $
*
*-------------------------------------------------------------------------
*/
......@@ -153,6 +153,8 @@ extern Datum ts_rankcd_wtt(PG_FUNCTION_ARGS);
extern Datum ts_rankcd_ttf(PG_FUNCTION_ARGS);
extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS);
extern Datum ts_typanalyze(PG_FUNCTION_ARGS);
/*
* TSQuery
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment