Add a new, improved version of citext as a contrib module.

David E. Wheeler

Add a new, improved version of citext as a contrib module.
David E. Wheeler
ab9907f5 · Tom Lane · 6fe87963 · ab9907f5 · ab9907f5 · ab9907f5
Commit ab9907f5 authored Jul 29, 2008 by Tom Lane
11 changed files
--- a/contrib/Makefile
+++ b/contrib/Makefile
-# $PostgreSQL: pgsql/contrib/Makefile,v 1.83 2008/05/08 16:49:36 tgl Exp $
+# $PostgreSQL: pgsql/contrib/Makefile,v 1.84 2008/07/29 18:31:20 tgl Exp $
 subdir = contrib
 top_builddir = ..
@@ -8,6 +8,7 @@ WANTED_DIRS = \
 		adminpack	\
 		btree_gist	\
 		chkpass		\
+		citext		\
 		cube		\
 		dblink		\
 		dict_int	\

--- a/contrib/README
+++ b/contrib/README
@@ -36,6 +36,10 @@ chkpass -
 	An auto-encrypted password datatype
 	by D'Arcy J.M. Cain <darcy@druid.net>
+citext -
+	A case-insensitive character string datatype
+	by David E. Wheeler <david@kineticode.com>
 cube -
 	Multidimensional-cube datatype (GiST indexing example)
 	by Gene Selkov, Jr. <selkovjr@mcs.anl.gov>

--- a/contrib/citext/Makefile
+++ b/contrib/citext/Makefile
+# $PostgreSQL: pgsql/contrib/citext/Makefile,v 1.1 2008/07/29 18:31:20 tgl Exp $
+MODULES = citext
+DATA_built = citext.sql
+DATA = uninstall_citext.sql
+REGRESS = citext
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/citext
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
--- a/contrib/citext/citext.c
+++ b/contrib/citext/citext.c
+/*
+ * $PostgreSQL: pgsql/contrib/citext/citext.c,v 1.1 2008/07/29 18:31:20 tgl Exp $
+ */
+#include "postgres.h"
+#include "access/hash.h"
+#include "fmgr.h"
+#include "utils/builtins.h"
+#include "utils/formatting.h"
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+/*
+ *      ====================
+ *      FORWARD DECLARATIONS
+ *      ====================
+ */
+static int32  citextcmp      (text *left, text *right);
+extern Datum  citext_cmp     (PG_FUNCTION_ARGS);
+extern Datum  citext_hash    (PG_FUNCTION_ARGS);
+extern Datum  citext_eq      (PG_FUNCTION_ARGS);
+extern Datum  citext_ne      (PG_FUNCTION_ARGS);
+extern Datum  citext_gt      (PG_FUNCTION_ARGS);
+extern Datum  citext_ge      (PG_FUNCTION_ARGS);
+extern Datum  citext_lt      (PG_FUNCTION_ARGS);
+extern Datum  citext_le      (PG_FUNCTION_ARGS);
+extern Datum  citext_smaller (PG_FUNCTION_ARGS);
+extern Datum  citext_larger  (PG_FUNCTION_ARGS);
+/*
+ *      =================
+ *      UTILITY FUNCTIONS
+ *      =================
+ */
+/*
+ * citextcmp()
+ * Internal comparison function for citext strings.
+ * Returns int32 negative, zero, or positive.
+ */
+static int32
+citextcmp (text *left, text *right)
+{
+   char   *lcstr, *rcstr;
+   int32	result;
+   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left));
+   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right));
+   result = varstr_cmp(lcstr, strlen(lcstr),
+					   rcstr, strlen(rcstr));
+   pfree(lcstr);
+   pfree(rcstr);
+   return result;
+}
+/*
+ *      ==================
+ *      INDEXING FUNCTIONS
+ *      ==================
+ */
+PG_FUNCTION_INFO_V1(citext_cmp);
+Datum
+citext_cmp(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   int32 result;
+   result = citextcmp(left, right);
+   PG_FREE_IF_COPY(left, 0);
+   PG_FREE_IF_COPY(right, 1);
+   PG_RETURN_INT32(result);
+}
+PG_FUNCTION_INFO_V1(citext_hash);
+Datum
+citext_hash(PG_FUNCTION_ARGS)
+{
+   text       *txt = PG_GETARG_TEXT_PP(0);
+   char       *str;
+   Datum       result;
+   str    = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt));
+   result = hash_any((unsigned char *) str, strlen(str));
+   pfree(str);
+   /* Avoid leaking memory for toasted inputs */
+   PG_FREE_IF_COPY(txt, 0);
+   PG_RETURN_DATUM(result);
+}
+/*
+ *      ==================
+ *      OPERATOR FUNCTIONS
+ *      ==================
+ */
+PG_FUNCTION_INFO_V1(citext_eq);
+Datum
+citext_eq(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   char *lcstr, *rcstr;
+   bool  result;
+   /* We can't compare lengths in advance of downcasing ... */
+   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left));
+   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right));
+   /*
+    * Since we only care about equality or not-equality, we can
+    * avoid all the expense of strcoll() here, and just do bitwise
+    * comparison.
+    */
+   result = (strcmp(lcstr, rcstr) == 0);
+   pfree(lcstr);
+   pfree(rcstr);
+   PG_FREE_IF_COPY(left, 0);
+   PG_FREE_IF_COPY(right, 1);
+   PG_RETURN_BOOL(result);
+}
+PG_FUNCTION_INFO_V1(citext_ne);
+Datum
+citext_ne(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   char *lcstr, *rcstr;
+   bool  result;
+   /* We can't compare lengths in advance of downcasing ... */
+   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left));
+   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right));
+   /*
+    * Since we only care about equality or not-equality, we can
+    * avoid all the expense of strcoll() here, and just do bitwise
+    * comparison.
+    */
+   result = (strcmp(lcstr, rcstr) != 0);
+   pfree(lcstr);
+   pfree(rcstr);
+   PG_FREE_IF_COPY(left, 0);
+   PG_FREE_IF_COPY(right, 1);
+   PG_RETURN_BOOL(result);
+}
+PG_FUNCTION_INFO_V1(citext_lt);
+Datum
+citext_lt(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   bool  result;
+   result = citextcmp(left, right) < 0;
+   PG_FREE_IF_COPY(left, 0);
+   PG_FREE_IF_COPY(right, 1);
+   PG_RETURN_BOOL(result);
+}
+PG_FUNCTION_INFO_V1(citext_le);
+Datum
+citext_le(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   bool  result;
+   result = citextcmp(left, right) <= 0;
+   PG_FREE_IF_COPY(left, 0);
+   PG_FREE_IF_COPY(right, 1);
+   PG_RETURN_BOOL(result);
+}
+PG_FUNCTION_INFO_V1(citext_gt);
+Datum
+citext_gt(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   bool  result;
+   result = citextcmp(left, right) > 0;
+   PG_FREE_IF_COPY(left, 0);
+   PG_FREE_IF_COPY(right, 1);
+   PG_RETURN_BOOL(result);
+}
+PG_FUNCTION_INFO_V1(citext_ge);
+Datum
+citext_ge(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   bool  result;
+   result = citextcmp(left, right) >= 0;
+   PG_FREE_IF_COPY(left, 0);
+   PG_FREE_IF_COPY(right, 1);
+   PG_RETURN_BOOL(result);
+}
+/*
+ *      ===================
+ *      AGGREGATE FUNCTIONS
+ *      ===================
+ */
+PG_FUNCTION_INFO_V1(citext_smaller);
+Datum
+citext_smaller(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   text *result;
+   result = citextcmp(left, right) < 0 ? left : right;
+   PG_RETURN_TEXT_P(result);
+}
+PG_FUNCTION_INFO_V1(citext_larger);
+Datum
+citext_larger(PG_FUNCTION_ARGS)
+{
+   text *left  = PG_GETARG_TEXT_PP(0);
+   text *right = PG_GETARG_TEXT_PP(1);
+   text *result;
+   result = citextcmp(left, right) > 0 ? left : right;
+   PG_RETURN_TEXT_P(result);
+}
--- a/contrib/citext/citext.sql.in
+++ b/contrib/citext/citext.sql.in
--- a/contrib/citext/expected/citext.out
+++ b/contrib/citext/expected/citext.out
--- a/contrib/citext/sql/citext.sql
+++ b/contrib/citext/sql/citext.sql
--- a/contrib/citext/uninstall_citext.sql
+++ b/contrib/citext/uninstall_citext.sql
+/* $PostgreSQL: pgsql/contrib/citext/uninstall_citext.sql,v 1.1 2008/07/29 18:31:20 tgl Exp $ */
+-- Adjust this setting to control where the objects get dropped.
+SET search_path = public;
+DROP OPERATOR CLASS citext_ops USING btree CASCADE;
+DROP OPERATOR CLASS citext_ops USING hash CASCADE;
+DROP AGGREGATE min(citext);
+DROP AGGREGATE max(citext);
+DROP OPERATOR = (citext, citext);
+DROP OPERATOR <> (citext, citext);
+DROP OPERATOR < (citext, citext);
+DROP OPERATOR <= (citext, citext);
+DROP OPERATOR >= (citext, citext);
+DROP OPERATOR > (citext, citext);
+DROP OPERATOR || (citext, citext);
+DROP OPERATOR ~ (citext, citext);
+DROP OPERATOR ~* (citext, citext);
+DROP OPERATOR !~ (citext, citext);
+DROP OPERATOR !~* (citext, citext);
+DROP OPERATOR ~~ (citext, citext);
+DROP OPERATOR ~~* (citext, citext);
+DROP OPERATOR !~~ (citext, citext);
+DROP OPERATOR !~~* (citext, citext);
+DROP OPERATOR ~ (citext, text);
+DROP OPERATOR ~* (citext, text);
+DROP OPERATOR !~ (citext, text);
+DROP OPERATOR !~* (citext, text);
+DROP OPERATOR ~~ (citext, text);
+DROP OPERATOR ~~* (citext, text);
+DROP OPERATOR !~~ (citext, text);
+DROP OPERATOR !~~* (citext, text);
+DROP CAST (citext AS text);
+DROP CAST (citext AS varchar);
+DROP CAST (citext AS bpchar);
+DROP CAST (text AS citext);
+DROP CAST (varchar AS citext);
+DROP CAST (bpchar AS citext);
+DROP FUNCTION citext(bpchar);
+DROP FUNCTION citext_eq(citext, citext);
+DROP FUNCTION citext_ne(citext, citext);
+DROP FUNCTION citext_lt(citext, citext);
+DROP FUNCTION citext_le(citext, citext);
+DROP FUNCTION citext_gt(citext, citext);
+DROP FUNCTION citext_ge(citext, citext);
+DROP FUNCTION textcat(citext, citext);
+DROP FUNCTION citext_cmp(citext, citext);
+DROP FUNCTION citext_hash(citext);
+DROP FUNCTION citext_smaller(citext, citext);
+DROP FUNCTION citext_larger(citext, citext);
+DROP FUNCTION lower(citext);
+DROP FUNCTION upper(citext);
+DROP FUNCTION quote_literal(citext);
+DROP FUNCTION texticlike(citext, citext);
+DROP FUNCTION texticnlike(citext, citext);
+DROP FUNCTION texticregexeq(citext, citext);
+DROP FUNCTION texticregexne(citext, citext);
+DROP FUNCTION texticlike(citext, text);
+DROP FUNCTION texticnlike(citext, text);
+DROP FUNCTION texticregexeq(citext, text);
+DROP FUNCTION texticregexne(citext, text);
+DROP TYPE citext CASCADE;
--- a/doc/src/sgml/citext.sgml
+++ b/doc/src/sgml/citext.sgml
+<!-- $PostgreSQL: pgsql/doc/src/sgml/citext.sgml,v 1.1 2008/07/29 18:31:20 tgl Exp $ -->
+<sect1 id="citext">
+ <title>citext</title>
+ <indexterm zone="citext">
+  <primary>citext</primary>
+ </indexterm>
+ <para>
+  The <filename>citext</> module provides a case-insensitive
+  character string type, <type>citext</>. Essentially, it internally calls
+  <function>lower</> when comparing values. Otherwise, it behaves almost
+  exactly like <type>text</>.
+ </para>
+ <sect2>
+  <title>Rationale</title>
+  <para>
+   The standard approach to doing case-insensitive matches
+   in <productname>PostgreSQL</> has been to use the <function>lower</>
+   function when comparing values, for example
+  <programlisting>
+   SELECT * FROM tab WHERE lower(col) = LOWER(?);
+  </programlisting>
+  </para>
+  <para>
+   This works reasonably well, but has a number of drawbacks:
+  </para>
+   <itemizedlist>
+    <listitem>
+     <para>
+      It makes your SQL statements verbose, and you always have to remember to
+      use <function>lower</> on both the column and the query value.
+     </para>
+    </listitem>
+    <listitem>
+     <para>
+      It won't use an index, unless you create a functional index using
+      <function>lower</>.
+     </para>
+    </listitem>
+    <listitem>
+     <para>
+      If you declare a column as <literal>UNIQUE</> or <literal>PRIMARY
+      KEY</>, the implicitly generated index is case-sensitive.  So it's
+      useless for case-insensitive searches, and it won't enforce
+      uniqueness case-insensitively.
+     </para>
+    </listitem>
+   </itemizedlist>
+   <para>
+    The <type>citext</> data type allows you to eliminate calls
+    to <function>lower</> in SQL queries, and allows a primary key to
+    be case-insensitive. <type>citext</> is locale-aware, just
+    like <type>text</>, which means that the comparison of uppercase and
+    lowercase characters is dependent on the rules of
+    the <literal>LC_CTYPE</> locale setting. Again, this behavior is
+    identical to the use of <function>lower</> in queries. But because it's
+    done transparently by the datatype, you don't have to remember to do
+    anything special in your queries.
+   </para>
+ </sect2>
+ <sect2>
+  <title>How to Use It</title>
+  <para>
+   Here's a simple example of usage:
+  <programlisting>
+   CREATE TABLE users (
+       nick CITEXT PRIMARY KEY,
+       pass TEXT   NOT NULL
+   );
+   INSERT INTO users VALUES ( 'larry',  md5(random()::text) );
+   INSERT INTO users VALUES ( 'Tom',    md5(random()::text) );
+   INSERT INTO users VALUES ( 'Damian', md5(random()::text) );
+   INSERT INTO users VALUES ( 'NEAL',   md5(random()::text) );
+   INSERT INTO users VALUES ( 'Bj&oslash;rn',  md5(random()::text) );
+   SELECT * FROM users WHERE nick = 'Larry';
+  </programlisting>
+   The <command>SELECT</> statement will return one tuple, even though
+   the <structfield>nick</> column was set to <quote>larry</> and the query
+   was for <quote>Larry</>.
+  </para>
+ </sect2>
+ <sect2>
+  <title>Limitations</title>
+   <itemizedlist>
+    <listitem>
+     <para>
+      <type>citext</>'s behavior depends on
+      the <literal>LC_CTYPE</> setting of your database. How it compares
+      values is therefore determined when
+      <application>initdb</> is run to create the cluster. It is not truly
+      case-insensitive in the terms defined by the Unicode standard.
+      Effectively, what this means is that, as long as you're happy with your
+      collation, you should be happy with <type>citext</>'s comparisons. But
+      if you have data in different languages stored in your database, users
+      of one language may find their query results are not as expected if the
+      collation is for another language.
+     </para>
+    </listitem>
+    <listitem>
+     <para>
+      The pattern-matching comparison operators, such as <literal>LIKE</>,
+      <literal>~</>, <literal>~~</>, <literal>!~</>, <literal>!~~</>, etc.,
+      have been overloaded to make case-insensitive comparisons when their
+      left-hand argument is of type <type>citext</>.  However, related
+      functions have not been changed, including:
+     </para>
+     <itemizedlist>
+      <listitem>
+       <para>
+         <function>regexp_replace()</>
+       </para>
+      </listitem>
+      <listitem>
+       <para>
+         <function>regexp_split_to_array()</>
+       </para>
+      </listitem>
+      <listitem>
+       <para>
+         <function>regexp_split_to_table()</>
+       </para>
+      </listitem>
+      <listitem>
+       <para>
+         <function>replace()</>
+       </para>
+      </listitem>
+      <listitem>
+       <para>
+         <function>split_part()</>
+       </para>
+      </listitem>
+      <listitem>
+       <para>
+         <function>strpos()</>
+       </para>
+      </listitem>
+      <listitem>
+       <para>
+         <function>translate()</>
+       </para>
+      </listitem>
+     </itemizedlist>
+     <para>
+      Of course, for the regular expression functions, you can specify
+      case-insensitive comparisons in their <parameter>flags</> arguments, but
+      the same cannot be done for the the non-regexp functions.
+     </para>
+    </listitem>
+    <listitem>
+     <para>
+       <type>citext</> is not as efficient as <type>text</> because the
+       operator functions and the btree comparison functions must make copies
+       of the data and convert it to lower case for comparisons. It is,
+       however, slightly more efficient than using <function>lower</> to get
+       case-insensitive matching.
+     </para>
+    </listitem>
+    <listitem>
+     <para>
+      <productname>PostgreSQL</> supports casting between <type>text</>
+      and any other type (even non-string types) by using the other type's
+      I/O functions.  This doesn't work with <type>citext</>.  However,
+      you can cast via I/O functions in two steps, for example
+      <literal><replaceable>somevalue</>::text::citext</literal> or
+      <literal><replaceable>citextvalue</>::text::<replaceable>sometype</></literal>.
+     </para>
+    </listitem>
+    <listitem>
+     <para>
+      <type>citext</> doesn't help much if you need data to compare
+      case-sensitively in some contexts and case-insensitively in other
+      contexts.  The standard answer is to use the <type>text</> type and
+      manually use the <function>lower</> function when you need to compare
+      case-insensitively; this works all right if case-insensitive comparison
+      is needed only infrequently.  If you need case-insensitive most of
+      the time and case-sensitive infrequently, consider storing the data
+      as <type>citext</> and explicitly casting the column to <type>text</>
+      when you want case-sensitive comparison.  In either situation, you
+      will need two indexes if you want both types of searches to be fast.
+    </para>
+    </listitem>
+   </itemizedlist>
+ </sect2>
+ <sect2>
+  <title>Author</title>
+  <para>
+   David E. Wheeler <email>david@kineticode.com</email>
+  </para>
+  <para>
+    Inspired by the original <type>citext</> module by Donald Fraser.
+  </para>
+ </sect2>
+</sect1>
--- a/doc/src/sgml/contrib.sgml
+++ b/doc/src/sgml/contrib.sgml
-<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.8 2007/12/06 04:12:09 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.9 2008/07/29 18:31:20 tgl Exp $ -->
 <appendix id="contrib">
 <title>Additional Supplied Modules</title>
@@ -81,6 +81,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
 &adminpack;
 &btree-gist;
 &chkpass;
+ &citext;
 &cube;
 &dblink;
 &dict-int;

--- a/doc/src/sgml/filelist.sgml
+++ b/doc/src/sgml/filelist.sgml
-<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.56 2007/12/03 04:18:47 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.57 2008/07/29 18:31:20 tgl Exp $ -->
 <!entity history    SYSTEM "history.sgml">
 <!entity info       SYSTEM "info.sgml">
@@ -94,6 +94,7 @@
 <!entity adminpack       SYSTEM "adminpack.sgml">
 <!entity btree-gist      SYSTEM "btree-gist.sgml">
 <!entity chkpass         SYSTEM "chkpass.sgml">
+<!entity citext          SYSTEM "citext.sgml">
 <!entity cube            SYSTEM "cube.sgml">
 <!entity dblink          SYSTEM "dblink.sgml">
 <!entity dict-int        SYSTEM "dict-int.sgml">