Introduce parse_ident()

SQL-layer function to split qualified identifier into array parts. Author: Pavel Stehule with minor editorization by me and Jim Nasby

Introduce parse_ident()
SQL-layer function to split qualified identifier into array parts. Author: Pavel Stehule with minor editorization by me and Jim Nasby
3187d6de · Teodor Sigaev · 992b5ba3 · 3187d6de · 3187d6de · 3187d6de
Commit 3187d6de authored Mar 18, 2016 by Teodor Sigaev
10 changed files
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1818,6 +1818,32 @@
       <entry><literal>900150983cd24fb0 d6963f7d28e17f72</literal></entry>
      </row>
+      <row>
+       <entry>
+        <indexterm>
+         <primary>parse_ident</primary>
+        </indexterm>
+        <literal><function>parse_ident(<parameter>str</parameter> <type>text</type>,
+           [ <parameter>strictmode</parameter> <type>boolean</type> DEFAULT true ] )</function></literal>
+       </entry>
+       <entry><type>text[]</type></entry>
+       <entry>Split <parameter>qualified identifier</parameter> into array
+       <parameter>parts</parameter>. When <parameter>strictmode</parameter> is
+       false, extra characters after the identifier are ignored. This is useful
+       for parsing identifiers for objects like functions and arrays that may
+       have trailing characters. By default, extra characters after the last
+       identifier are considered an error, but if second parameter is false,
+       then chararacters after last identifier are ignored. Note that this
+       function does not truncate quoted identifiers. If you care about that
+       you should cast the result of this function to name[]. A non-printable
+       chararacters (like 0 to 31) are displayed as hexadecimal codes always,
+       what can be different from PostgreSQL internal SQL identifiers
+       processing, when the original escaped value is displayed.
+       </entry>
+       <entry><literal>parse_ident('"SomeSchema".someTable')</literal></entry>
+       <entry><literal>"SomeSchema,sometable"</literal></entry>
+      </row>
      <row>
       <entry>
        <indexterm>

--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -990,3 +990,10 @@ RETURNS jsonb
 LANGUAGE INTERNAL
 STRICT IMMUTABLE
 AS 'jsonb_set';
+CREATE OR REPLACE FUNCTION
+  parse_ident(str text, strict boolean DEFAULT true)
+RETURNS text[]
+LANGUAGE INTERNAL
+STRICT IMMUTABLE
+AS 'parse_ident';
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -129,6 +129,15 @@ scanstr(const char *s)
 */
 char *
 downcase_truncate_identifier(const char *ident, int len, bool warn)
+{
+	return downcase_identifier(ident, len, warn, true);
+}
+/*
+ * a workhorse for downcase_truncate_identifier
+ */
+char *
+downcase_identifier(const char *ident, int len, bool warn, bool truncate)
 {
 	char	   *result;
 	int			i;
@@ -158,12 +167,13 @@ downcase_truncate_identifier(const char *ident, int len, bool warn)
 	}
 	result[i] = '\0';
-	if (i >= NAMEDATALEN)
+	if (i >= NAMEDATALEN && truncate)
 		truncate_identifier(result, i, warn);
 	return result;
 }
 /*
 * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
 *

--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -27,6 +27,7 @@
 #include "commands/dbcommands.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "parser/scansup.h"
 #include "parser/keywords.h"
 #include "postmaster/syslogger.h"
 #include "rewrite/rewriteHandler.h"
@@ -719,3 +720,226 @@ pg_column_is_updatable(PG_FUNCTION_ARGS)
 	PG_RETURN_BOOL((events & REQ_EVENTS) == REQ_EVENTS);
 }
+/*
+ * This simple parser utility are compatible with lexer implementation,
+ * used only in parse_ident function
+ */
+static bool
+is_ident_start(unsigned char c)
+{
+	if (c == '_')
+		return true;
+	if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
+		return true;
+	if (c >= 0200 && c <= 0377)
+		return true;
+	return false;
+}
+static bool
+is_ident_cont(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return true;
+	return is_ident_start(c);
+}
+/*
+ * Sanitize SQL string for using in error message.
+ */
+static char *
+sanitize_text(text *t)
+{
+	int			len = VARSIZE_ANY_EXHDR(t);
+	const char *p = VARDATA_ANY(t);
+	StringInfo	dstr;
+	dstr = makeStringInfo();
+	appendStringInfoChar(dstr, '"');
+	while (len--)
+	{
+		switch (*p)
+		{
+			case '\b':
+				appendStringInfoString(dstr, "\\b");
+				break;
+			case '\f':
+				appendStringInfoString(dstr, "\\f");
+				break;
+			case '\n':
+				appendStringInfoString(dstr, "\\n");
+				break;
+			case '\r':
+				appendStringInfoString(dstr, "\\r");
+				break;
+			case '\t':
+				appendStringInfoString(dstr, "\\t");
+				break;
+			case '\'':
+				appendStringInfoString(dstr, "''");
+				break;
+			case '\\':
+				appendStringInfoString(dstr, "\\\\");
+				break;
+			default:
+				if ((unsigned char) *p < ' ')
+					appendStringInfo(dstr, "\\u%04x", (int) *p);
+				else
+					appendStringInfoCharMacro(dstr, *p);
+				break;
+		}
+		p++;
+	}
+	appendStringInfoChar(dstr, '"');
+	return dstr->data;
+}
+/*
+ * parse_ident - parse SQL composed identifier to separate identifiers.
+ * When strict mode is active (second parameter), then any chars after
+ * last identifiers are disallowed.
+ */
+Datum
+parse_ident(PG_FUNCTION_ARGS)
+{
+	text	   *qualname;
+	char	   *qualname_str;
+	bool		strict;
+	char	   *nextp;
+	bool		after_dot = false;
+	ArrayBuildState *astate = NULL;
+	qualname = PG_GETARG_TEXT_PP(0);
+	qualname_str = text_to_cstring(qualname);
+	strict = PG_GETARG_BOOL(1);
+	nextp = qualname_str;
+	/* skip leading whitespace */
+	while (isspace((unsigned char) *nextp))
+		nextp++;
+	for (;;)
+	{
+		char		*curname;
+		char		*endp;
+		bool		missing_ident;
+		missing_ident = true;
+		if (*nextp == '\"')
+		{
+			curname = nextp + 1;
+			for (;;)
+			{
+				endp = strchr(nextp + 1, '\"');
+				if (endp == NULL)
+					ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("unclosed double quotes"),
+						 errdetail("string %s is not valid identifier",
+									sanitize_text(qualname))));
+				if (endp[1] != '\"')
+					break;
+				memmove(endp, endp + 1, strlen(endp));
+				nextp = endp;
+			}
+			nextp = endp + 1;
+			*endp = '\0';
+			/* Show complete input string in this case. */
+			if (endp - curname == 0)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("identifier should not be empty: %s",
+							sanitize_text(qualname))));
+			astate = accumArrayResult(astate, CStringGetTextDatum(curname),
+									  false, TEXTOID, CurrentMemoryContext);
+			missing_ident = false;
+		}
+		else
+		{
+			if (is_ident_start((unsigned char) *nextp))
+			{
+				char *downname;
+				int	len;
+				text	*part;
+				curname = nextp++;
+				while (is_ident_cont((unsigned char) *nextp))
+					nextp++;
+				len = nextp - curname;
+				/*
+				 * Unlike name, we don't implicitly truncate identifiers. This
+				 * is useful for allowing the user to check for specific parts
+				 * of the identifier being too long. It's easy enough for the
+				 * user to get the truncated names by casting our output to
+				 * name[].
+				 */
+				downname = downcase_identifier(curname, len, false, false);
+				part = cstring_to_text_with_len(downname, len);
+				astate = accumArrayResult(astate, PointerGetDatum(part), false,
+										  TEXTOID, CurrentMemoryContext);
+				missing_ident = false;
+			}
+		}
+		if (missing_ident)
+		{
+			/* Different error messages based on where we failed. */
+			if (*nextp == '.')
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("missing valid identifier before \".\" symbol: %s",
+							sanitize_text(qualname))));
+			else if (after_dot)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("missing valid identifier after \".\" symbol: %s",
+							sanitize_text(qualname))));
+			else
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("missing valid identifier: %s",
+							sanitize_text(qualname))));
+		}
+		while (isspace((unsigned char) *nextp))
+			nextp++;
+		if (*nextp == '.')
+		{
+			after_dot = true;
+			nextp++;
+			while (isspace((unsigned char) *nextp))
+				nextp++;
+		}
+		else if (*nextp == '\0')
+		{
+			break;
+		}
+		else
+		{
+			if (strict)
+				ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("identifier contains disallowed characters: %s",
+							sanitize_text(qualname))));
+			break;
+		}
+	}
+	PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
+}
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
 */
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201603151
+#define CATALOG_VERSION_NO	201603181
 #endif
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -3463,6 +3463,9 @@ DESCR("I/O");
 DATA(insert OID = 4086 (  to_regnamespace	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 4089 "25" _null_ _null_ _null_ _null_ _null_ to_regnamespace _null_ _null_ _null_ ));
 DESCR("convert namespace name to regnamespace");
+DATA(insert OID = 1268 (  parse_ident		PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 1009 "25 16" _null_ _null_ "{str,strict}" _null_ _null_ parse_ident _null_ _null_ _null_ ));
+DESCR("parse qualified identifier to array of identifiers");
 DATA(insert OID = 2246 ( fmgr_internal_validator PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_internal_validator _null_ _null_ _null_ ));
 DESCR("(internal)");
 DATA(insert OID = 2247 ( fmgr_c_validator	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2278 "26" _null_ _null_ _null_ _null_ _null_ fmgr_c_validator _null_ _null_ _null_ ));

--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -20,6 +20,9 @@ extern char *scanstr(const char *s);
 extern char *downcase_truncate_identifier(const char *ident, int len,
 							 bool warn);
+extern char *downcase_identifier(const char *ident, int len,
+							 bool warn, bool truncate);
 extern void truncate_identifier(char *ident, int len, bool warn);
 extern bool scanner_isspace(char ch);

--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -510,6 +510,7 @@ extern Datum pg_typeof(PG_FUNCTION_ARGS);
 extern Datum pg_collation_for(PG_FUNCTION_ARGS);
 extern Datum pg_relation_is_updatable(PG_FUNCTION_ARGS);
 extern Datum pg_column_is_updatable(PG_FUNCTION_ARGS);
+extern Datum parse_ident(PG_FUNCTION_ARGS);
 /* oid.c */
 extern Datum oidin(PG_FUNCTION_ARGS);

--- a/src/test/regress/expected/name.out
+++ b/src/test/regress/expected/name.out
@@ -124,3 +124,69 @@ SELECT '' AS two, c.f1 FROM NAME_TBL c WHERE c.f1 ~ '.*asdf.*';
 (2 rows)
 DROP TABLE NAME_TBL;
+DO $$
+DECLARE r text[];
+BEGIN
+  r := parse_ident('Schemax.Tabley');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+  r := parse_ident('"SchemaX"."TableY"');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+END;
+$$;
+NOTICE:  schemax.tabley
+NOTICE:  "SchemaX"."TableY"
+SELECT parse_ident('foo.boo');
+ parse_ident 
+-------------
+ {foo,boo}
+(1 row)
+SELECT parse_ident('foo.boo[]'); -- should fail
+ERROR:  identifier contains disallowed characters: "foo.boo[]"
+SELECT parse_ident('foo.boo[]', strict => false); -- ok
+ parse_ident 
+-------------
+ {foo,boo}
+(1 row)
+-- should fail
+SELECT parse_ident(' ');
+ERROR:  missing valid identifier: " "
+SELECT parse_ident(' .aaa');
+ERROR:  missing valid identifier before "." symbol: " .aaa"
+SELECT parse_ident(' aaa . ');
+ERROR:  missing valid identifier after "." symbol: " aaa . "
+SELECT parse_ident('aaa.a%b');
+ERROR:  identifier contains disallowed characters: "aaa.a%b"
+SELECT parse_ident(E'X\rXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX');
+ERROR:  identifier contains disallowed characters: "X\rXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
+SELECT length(a[1]), length(a[2]) from parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy') as a ;
+ length | length 
+--------+--------
+    414 |    289
+(1 row)
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"');
+                                                parse_ident                                                
+-----------------------------------------------------------------------------------------------------------
+ {first,"  second  ","   third   ","  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+(1 row)
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"')::name[];
+                                             parse_ident                                              
+------------------------------------------------------------------------------------------------------
+ {first,"  second  ","   third   ","  xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+(1 row)
+SELECT parse_ident(E'"c".X XXXX\002XXXXXX');
+ERROR:  identifier contains disallowed characters: ""c".X XXXX\u0002XXXXXX"
+SELECT parse_ident('1020');
+ERROR:  missing valid identifier: "1020"
+SELECT parse_ident('10.20');
+ERROR:  missing valid identifier: "10.20"
+SELECT parse_ident('.');
+ERROR:  missing valid identifier before "." symbol: "."
+SELECT parse_ident('.1020');
+ERROR:  missing valid identifier before "." symbol: ".1020"
+SELECT parse_ident('xxx.1020');
+ERROR:  missing valid identifier after "." symbol: "xxx.1020"
--- a/src/test/regress/sql/name.sql
+++ b/src/test/regress/sql/name.sql
@@ -52,3 +52,36 @@ SELECT '' AS three, c.f1 FROM NAME_TBL c WHERE c.f1 ~ '[0-9]';
 SELECT '' AS two, c.f1 FROM NAME_TBL c WHERE c.f1 ~ '.*asdf.*';
 DROP TABLE NAME_TBL;
+DO $$
+DECLARE r text[];
+BEGIN
+  r := parse_ident('Schemax.Tabley');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+  r := parse_ident('"SchemaX"."TableY"');
+  RAISE NOTICE '%', format('%I.%I', r[1], r[2]);
+END;
+$$;
+SELECT parse_ident('foo.boo');
+SELECT parse_ident('foo.boo[]'); -- should fail
+SELECT parse_ident('foo.boo[]', strict => false); -- ok
+-- should fail
+SELECT parse_ident(' ');
+SELECT parse_ident(' .aaa');
+SELECT parse_ident(' aaa . ');
+SELECT parse_ident('aaa.a%b');
+SELECT parse_ident(E'X\rXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX');
+SELECT length(a[1]), length(a[2]) from parse_ident('"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx".yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy') as a ;
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"');
+SELECT parse_ident(' first . "  second  " ."   third   ". "  ' || repeat('x',66) || '"')::name[];
+SELECT parse_ident(E'"c".X XXXX\002XXXXXX');
+SELECT parse_ident('1020');
+SELECT parse_ident('10.20');
+SELECT parse_ident('.');
+SELECT parse_ident('.1020');
+SELECT parse_ident('xxx.1020');