Improve parser's one-extra-token lookahead mechanism.

There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way. In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster. I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs. I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, eg "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than the previous one.

Improve parser's one-extra-token lookahead mechanism.
There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way. In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster. I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs. I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, eg "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than the previous one.
d809fd00 · Tom Lane · 23a78352 · d809fd00 · d809fd00 · d809fd00
Commit d809fd00 authored Feb 24, 2015 by Tom Lane
8 changed files
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -633,9 +633,9 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 /*
 * The grammar thinks these are keywords, but they are not in the kwlist.h
 * list and so can never be entered directly.  The filter in parser.c
- * creates these tokens when required.
+ * creates these tokens when required (based on looking one token ahead).
 */
-%token			NULLS_FIRST NULLS_LAST WITH_ORDINALITY WITH_TIME
+%token			NULLS_LA WITH_LA
 /* Precedence: lowest to highest */
@@ -873,6 +873,7 @@ CreateRoleStmt:
 opt_with:	WITH									{}
+			| WITH_LA								{}
 			| /*EMPTY*/								{}
 		;
@@ -6673,8 +6674,8 @@ opt_asc_desc: ASC							{ $$ = SORTBY_ASC; }
 			| /*EMPTY*/						{ $$ = SORTBY_DEFAULT; }
 		;
-opt_nulls_order: NULLS_FIRST				{ $$ = SORTBY_NULLS_FIRST; }
+opt_nulls_order: NULLS_LA FIRST_P			{ $$ = SORTBY_NULLS_FIRST; }
-			| NULLS_LAST					{ $$ = SORTBY_NULLS_LAST; }
+			| NULLS_LA LAST_P				{ $$ = SORTBY_NULLS_LAST; }
 			| /*EMPTY*/						{ $$ = SORTBY_NULLS_DEFAULT; }
 		;
@@ -8923,7 +8924,7 @@ AlterTSDictionaryStmt:
 		;
 AlterTSConfigurationStmt:
-			ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list WITH any_name_list
+			ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list any_with any_name_list
 				{
 					AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
 					n->cfgname = $5;
@@ -8933,7 +8934,7 @@ AlterTSConfigurationStmt:
 					n->replace = false;
 					$$ = (Node*)n;
 				}
-			| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list WITH any_name_list
+			| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list any_with any_name_list
 				{
 					AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
 					n->cfgname = $5;
@@ -8943,7 +8944,7 @@ AlterTSConfigurationStmt:
 					n->replace = false;
 					$$ = (Node*)n;
 				}
-			| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name WITH any_name
+			| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name any_with any_name
 				{
 					AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
 					n->cfgname = $5;
@@ -8953,7 +8954,7 @@ AlterTSConfigurationStmt:
 					n->replace = true;
 					$$ = (Node*)n;
 				}
-			| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name WITH any_name
+			| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name any_with any_name
 				{
 					AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
 					n->cfgname = $5;
@@ -8981,6 +8982,11 @@ AlterTSConfigurationStmt:
 				}
 		;
+/* Use this if TIME or ORDINALITY after WITH should be taken as an identifier */
+any_with:	WITH									{}
+			| WITH_LA								{}
+		;
 /*****************************************************************************
 *
@@ -9891,6 +9897,8 @@ simple_select:
 *		AS (query) [ SEARCH or CYCLE clause ]
 *
 * We don't currently support the SEARCH or CYCLE clause.
+ *
+ * Recognizing WITH_LA here allows a CTE to be named TIME or ORDINALITY.
 */
 with_clause:
 		WITH cte_list
@@ -9900,6 +9908,13 @@ with_clause:
 				$$->recursive = false;
 				$$->location = @1;
 			}
+		| WITH_LA cte_list
+			{
+				$$ = makeNode(WithClause);
+				$$->ctes = $2;
+				$$->recursive = false;
+				$$->location = @1;
+			}
 		| WITH RECURSIVE cte_list
 			{
 				$$ = makeNode(WithClause);
@@ -10601,7 +10616,7 @@ opt_col_def_list: AS '(' TableFuncElementList ')'	{ $$ = $3; }
 			| /*EMPTY*/								{ $$ = NIL; }
 		;
-opt_ordinality: WITH_ORDINALITY						{ $$ = true; }
+opt_ordinality: WITH_LA ORDINALITY					{ $$ = true; }
 			| /*EMPTY*/								{ $$ = false; }
 		;
@@ -11057,7 +11072,7 @@ ConstInterval:
 		;
 opt_timezone:
-			WITH_TIME ZONE							{ $$ = TRUE; }
+			WITH_LA TIME ZONE						{ $$ = TRUE; }
 			| WITHOUT TIME ZONE						{ $$ = FALSE; }
 			| /*EMPTY*/								{ $$ = FALSE; }
 		;

--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -64,13 +64,13 @@ raw_parser(const char *str)
 /*
 * Intermediate filter between parser and core lexer (core_yylex in scan.l).
 *
- * The filter is needed because in some cases the standard SQL grammar
+ * This filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
- * lookahead by combining tokens here, in order to keep the grammar LALR(1).
+ * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
- * words.  Furthermore it's not clear how to do it without re-introducing
+ * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 *
@@ -84,7 +84,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
 	int			cur_token;
 	int			next_token;
-	core_YYSTYPE cur_yylval;
+	int			cur_token_length;
 	YYLTYPE		cur_yylloc;
 	/* Get next token --- we might already have it */
@@ -93,74 +93,85 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 		cur_token = yyextra->lookahead_token;
 		lvalp->core_yystype = yyextra->lookahead_yylval;
 		*llocp = yyextra->lookahead_yylloc;
+		*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
 		yyextra->have_lookahead = false;
 	}
 	else
 		cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
-	/* Do we need to look ahead for a possible multiword token? */
+	/*
+	 * If this token isn't one that requires lookahead, just return it.  If it
+	 * does, determine the token length.  (We could get that via strlen(), but
+	 * since we have such a small set of possibilities, hardwiring seems
+	 * feasible and more efficient.)
+	 */
 	switch (cur_token)
 	{
 		case NULLS_P:
+			cur_token_length = 5;
+			break;
+		case WITH:
+			cur_token_length = 4;
+			break;
+		default:
+			return cur_token;
+	}
-			/*
+	/*
-			 * NULLS FIRST and NULLS LAST must be reduced to one token
+	 * Identify end+1 of current token.  core_yylex() has temporarily stored a
-			 */
+	 * '\0' here, and will undo that when we call it again.  We need to redo
-			cur_yylval = lvalp->core_yystype;
+	 * it to fully revert the lookahead call for error reporting purposes.
-			cur_yylloc = *llocp;
+	 */
-			next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
+	yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
+		*llocp + cur_token_length;
+	Assert(*(yyextra->lookahead_end) == '\0');
+	/*
+	 * Save and restore *llocp around the call.  It might look like we could
+	 * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
+	 * does not work because flex actually holds onto the last-passed pointer
+	 * internally, and will use that for error reporting.  We need any error
+	 * reports to point to the current token, not the next one.
+	 */
+	cur_yylloc = *llocp;
+	/* Get next token, saving outputs into lookahead variables */
+	next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
+	yyextra->lookahead_token = next_token;
+	yyextra->lookahead_yylloc = *llocp;
+	*llocp = cur_yylloc;
+	/* Now revert the un-truncation of the current token */
+	yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
+	*(yyextra->lookahead_end) = '\0';
+	yyextra->have_lookahead = true;
+	/* Replace cur_token if needed, based on lookahead */
+	switch (cur_token)
+	{
+		case NULLS_P:
+			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
 			switch (next_token)
 			{
 				case FIRST_P:
-					cur_token = NULLS_FIRST;
-					break;
 				case LAST_P:
-					cur_token = NULLS_LAST;
+					cur_token = NULLS_LA;
-					break;
-				default:
-					/* save the lookahead token for next time */
-					yyextra->lookahead_token = next_token;
-					yyextra->lookahead_yylval = lvalp->core_yystype;
-					yyextra->lookahead_yylloc = *llocp;
-					yyextra->have_lookahead = true;
-					/* and back up the output info to cur_token */
-					lvalp->core_yystype = cur_yylval;
-					*llocp = cur_yylloc;
 					break;
 			}
 			break;
 		case WITH:
+			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
-			/*
-			 * WITH TIME and WITH ORDINALITY must each be reduced to one token
-			 */
-			cur_yylval = lvalp->core_yystype;
-			cur_yylloc = *llocp;
-			next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
 			switch (next_token)
 			{
 				case TIME:
-					cur_token = WITH_TIME;
-					break;
 				case ORDINALITY:
-					cur_token = WITH_ORDINALITY;
+					cur_token = WITH_LA;
-					break;
-				default:
-					/* save the lookahead token for next time */
-					yyextra->lookahead_token = next_token;
-					yyextra->lookahead_yylval = lvalp->core_yystype;
-					yyextra->lookahead_yylloc = *llocp;
-					yyextra->have_lookahead = true;
-					/* and back up the output info to cur_token */
-					lvalp->core_yystype = cur_yylval;
-					*llocp = cur_yylloc;
 					break;
 			}
 			break;
-		default:
-			break;
 	}
 	return cur_token;

--- a/src/include/parser/gramparse.h
+++ b/src/include/parser/gramparse.h
@@ -46,6 +46,8 @@ typedef struct base_yy_extra_type
 	int			lookahead_token;	/* one-token lookahead */
 	core_YYSTYPE lookahead_yylval;		/* yylval for lookahead token */
 	YYLTYPE		lookahead_yylloc;		/* yylloc for lookahead token */
+	char	   *lookahead_end;	/* end of current token */
+	char		lookahead_hold_char;	/* to be put back at *lookahead_end */
 	/*
 	 * State variables that belong to the grammar.

--- a/src/interfaces/ecpg/preproc/parse.pl
+++ b/src/interfaces/ecpg/preproc/parse.pl
@@ -42,10 +42,8 @@ my %replace_token = (
 # or in the block
 my %replace_string = (
-	'WITH_TIME'       => 'with time',
+	'NULLS_LA'        => 'nulls',
-	'WITH_ORDINALITY' => 'with ordinality',
+	'WITH_LA'         => 'with',
-	'NULLS_FIRST'     => 'nulls first',
-	'NULLS_LAST'      => 'nulls last',
 	'TYPECAST'        => '::',
 	'DOT_DOT'         => '..',
 	'COLON_EQUALS'    => ':=',);

--- a/src/interfaces/ecpg/preproc/parser.c
+++ b/src/interfaces/ecpg/preproc/parser.c
@@ -3,11 +3,8 @@
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
- * Note that the grammar is not allowed to perform any table access
+ * This should match src/backend/parser/parser.c, except that we do not
- * (since we need to be able to do basic parsing even while inside an
+ * need to bother with re-entrant interfaces.
- * aborted transaction).  Therefore, the data structures returned by
- * the grammar are "raw" parsetrees that still need to be analyzed by
- * analyze.c and related files.
 *
 *
 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
@@ -29,18 +26,21 @@ static bool have_lookahead;		/* is lookahead info valid? */
 static int	lookahead_token;	/* one-token lookahead */
 static YYSTYPE lookahead_yylval;	/* yylval for lookahead token */
 static YYLTYPE lookahead_yylloc;	/* yylloc for lookahead token */
+static char *lookahead_yytext;	/* start current token */
+static char *lookahead_end;		/* end of current token */
+static char lookahead_hold_char;	/* to be put back at *lookahead_end */
 /*
 * Intermediate filter between parser and base lexer (base_yylex in scan.l).
 *
- * The filter is needed because in some cases the standard SQL grammar
+ * This filter is needed because in some cases the standard SQL grammar
 * requires more than one token lookahead.  We reduce these cases to one-token
- * lookahead by combining tokens here, in order to keep the grammar LALR(1).
+ * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
 *
 * Using a filter is simpler than trying to recognize multiword tokens
 * directly in scan.l, because we'd have to allow for comments between the
- * words.  Furthermore it's not clear how to do it without re-introducing
+ * words.  Furthermore it's not clear how to do that without re-introducing
 * scanner backtrack, which would cost more performance than this filter
 * layer does.
 */
@@ -49,8 +49,10 @@ filtered_base_yylex(void)
 {
 	int			cur_token;
 	int			next_token;
+	int			cur_token_length;
 	YYSTYPE		cur_yylval;
 	YYLTYPE		cur_yylloc;
+	char	   *cur_yytext;
 	/* Get next token --- we might already have it */
 	if (have_lookahead)
@@ -58,74 +60,86 @@ filtered_base_yylex(void)
 		cur_token = lookahead_token;
 		base_yylval = lookahead_yylval;
 		base_yylloc = lookahead_yylloc;
+		yytext = lookahead_yytext;
+		*lookahead_end = lookahead_hold_char;
 		have_lookahead = false;
 	}
 	else
 		cur_token = base_yylex();
-	/* Do we need to look ahead for a possible multiword token? */
+	/*
+	 * If this token isn't one that requires lookahead, just return it.  If it
+	 * does, determine the token length.  (We could get that via strlen(), but
+	 * since we have such a small set of possibilities, hardwiring seems
+	 * feasible and more efficient.)
+	 */
 	switch (cur_token)
 	{
 		case NULLS_P:
+			cur_token_length = 5;
+			break;
+		case WITH:
+			cur_token_length = 4;
+			break;
+		default:
+			return cur_token;
+	}
+	/*
+	 * Identify end+1 of current token.  base_yylex() has temporarily stored a
+	 * '\0' here, and will undo that when we call it again.  We need to redo
+	 * it to fully revert the lookahead call for error reporting purposes.
+	 */
+	lookahead_end = yytext + cur_token_length;
+	Assert(*lookahead_end == '\0');
+	/* Save and restore lexer output variables around the call */
+	cur_yylval = base_yylval;
+	cur_yylloc = base_yylloc;
+	cur_yytext = yytext;
+	/* Get next token, saving outputs into lookahead variables */
+	next_token = base_yylex();
+	lookahead_token = next_token;
+	lookahead_yylval = base_yylval;
+	lookahead_yylloc = base_yylloc;
+	lookahead_yytext = yytext;
+	base_yylval = cur_yylval;
+	base_yylloc = cur_yylloc;
+	yytext = cur_yytext;
+	/* Now revert the un-truncation of the current token */
+	lookahead_hold_char = *lookahead_end;
+	*lookahead_end = '\0';
+	have_lookahead = true;
-			/*
+	/* Replace cur_token if needed, based on lookahead */
-			 * NULLS FIRST and NULLS LAST must be reduced to one token
+	switch (cur_token)
-			 */
+	{
-			cur_yylval = base_yylval;
+		case NULLS_P:
-			cur_yylloc = base_yylloc;
+			/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
-			next_token = base_yylex();
 			switch (next_token)
 			{
 				case FIRST_P:
-					cur_token = NULLS_FIRST;
-					break;
 				case LAST_P:
-					cur_token = NULLS_LAST;
+					cur_token = NULLS_LA;
-					break;
-				default:
-					/* save the lookahead token for next time */
-					lookahead_token = next_token;
-					lookahead_yylval = base_yylval;
-					lookahead_yylloc = base_yylloc;
-					have_lookahead = true;
-					/* and back up the output info to cur_token */
-					base_yylval = cur_yylval;
-					base_yylloc = cur_yylloc;
 					break;
 			}
 			break;
 		case WITH:
+			/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
-			/*
-			 * WITH TIME must be reduced to one token
-			 */
-			cur_yylval = base_yylval;
-			cur_yylloc = base_yylloc;
-			next_token = base_yylex();
 			switch (next_token)
 			{
 				case TIME:
-					cur_token = WITH_TIME;
-					break;
 				case ORDINALITY:
-					cur_token = WITH_ORDINALITY;
+					cur_token = WITH_LA;
-					break;
-				default:
-					/* save the lookahead token for next time */
-					lookahead_token = next_token;
-					lookahead_yylval = base_yylval;
-					lookahead_yylloc = base_yylloc;
-					have_lookahead = true;
-					/* and back up the output info to cur_token */
-					base_yylval = cur_yylval;
-					base_yylloc = cur_yylloc;
 					break;
 			}
 			break;
-		default:
-			break;
 	}
 	return cur_token;

--- a/src/test/regress/expected/foreign_data.out
+++ b/src/test/regress/expected/foreign_data.out
@@ -663,7 +663,7 @@ LINE 1: CREATE FOREIGN TABLE ft1 ();
 CREATE FOREIGN TABLE ft1 () SERVER no_server;                   -- ERROR
 ERROR:  server "no_server" does not exist
 CREATE FOREIGN TABLE ft1 () SERVER s0 WITH OIDS;                -- ERROR
-ERROR:  syntax error at or near "WITH OIDS"
+ERROR:  syntax error at or near "WITH"
 LINE 1: CREATE FOREIGN TABLE ft1 () SERVER s0 WITH OIDS;
                                              ^
 CREATE FOREIGN TABLE ft1 (

--- a/src/test/regress/expected/with.out
+++ b/src/test/regress/expected/with.out
@@ -2155,3 +2155,18 @@ WITH t AS (
 VALUES(FALSE);
 ERROR:  conditional DO INSTEAD rules are not supported for data-modifying statements in WITH
 DROP RULE y_rule ON y;
+-- check that parser lookahead for WITH doesn't cause any odd behavior
+create table foo (with baz);  -- fail, WITH is a reserved word
+ERROR:  syntax error at or near "with"
+LINE 1: create table foo (with baz);
+                          ^
+create table foo (with ordinality);  -- fail, WITH is a reserved word
+ERROR:  syntax error at or near "with"
+LINE 1: create table foo (with ordinality);
+                          ^
+with ordinality as (select 1 as x) select * from ordinality;
+ x 
+---
+ 1
+(1 row)
--- a/src/test/regress/sql/with.sql
+++ b/src/test/regress/sql/with.sql
@@ -956,3 +956,8 @@ WITH t AS (
 )
 VALUES(FALSE);
 DROP RULE y_rule ON y;
+-- check that parser lookahead for WITH doesn't cause any odd behavior
+create table foo (with baz);  -- fail, WITH is a reserved word
+create table foo (with ordinality);  -- fail, WITH is a reserved word
+with ordinality as (select 1 as x) select * from ordinality;