Commit 2a0af7fe authored by Tom Lane

Allow complemented character class escapes within regex brackets.

The complement-class escapes \D, \S, \W are now allowed within
bracket expressions.  There is no semantic difficulty with doing
that, but the rather hokey macro-expansion-based implementation
previously used here couldn't cope.

Also, invent "word" as an allowed character class name, thus "\w"
is now equivalent to "[[:word:]]" outside brackets, or "[:word:]"
within brackets.  POSIX allows such implementation-specific
extensions, and the same name is used in e.g. bash.

One surprising compatibility issue this raises is that constructs
such as "[\w-_]" are now disallowed, as our documentation has always
said they should be: character classes can't be endpoints of a range.
Previously, because \w was just a macro for "[:alnum:]_", such a
construct was read as "[[:alnum:]_-_]", so it was accepted so long as
the character after "-" was numerically greater than or equal to "_".

Some implementation cleanup along the way:

* Remove the lexnest() hack, and in consequence clean up wordchrs()
to not interact with the lexer.

* Fix colorcomplement() to not be O(N^2) in the number of colors
involved.

* Get rid of useless-as-far-as-I-can-see calls of element()
on single-character character element names in brackpart().
element() always maps these to the character itself, and things
would be quite broken if it didn't --- should "[a]" match something
different than "a" does?  Besides, the shortcut path in brackpart()
wasn't doing this anyway, making it even more inconsistent.

Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
parent 6b40d9bd
......@@ -6097,6 +6097,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
non-ASCII characters to belong to any of these classes.)
In addition to these standard character
classes, <productname>PostgreSQL</productname> defines
the <literal>word</literal> character class, which is the same as
<literal>alnum</literal> plus the underscore (<literal>_</literal>)
character, and
the <literal>ascii</literal> character class, which contains exactly
the 7-bit ASCII set.
</para>
......@@ -6108,9 +6111,9 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
matching empty strings at the beginning
and end of a word respectively. A word is defined as a sequence
of word characters that is neither preceded nor followed by word
characters. A word character is an <literal>alnum</literal> character (as
defined by the <acronym>POSIX</acronym> character class described above)
or an underscore. This is an extension, compatible with but not
characters. A word character is any character belonging to the
<literal>word</literal> character class, that is, any letter, digit,
or underscore. This is an extension, compatible with but not
specified by <acronym>POSIX</acronym> 1003.2, and should be used with
caution in software intended to be portable to other systems.
The constraint escapes described below are usually preferable; they
......@@ -6330,8 +6333,7 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
<row>
<entry> <literal>\w</literal> </entry>
<entry> <literal>[[:alnum:]_]</literal>
(note underscore is included) </entry>
<entry> <literal>[[:word:]]</literal> </entry>
</row>
<row>
......@@ -6346,21 +6348,18 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', '\s*') AS foo;
<row>
<entry> <literal>\W</literal> </entry>
<entry> <literal>[^[:alnum:]_]</literal>
(note underscore is included) </entry>
<entry> <literal>[^[:word:]]</literal> </entry>
</row>
</tbody>
</tgroup>
</table>
<para>
Within bracket expressions, <literal>\d</literal>, <literal>\s</literal>,
and <literal>\w</literal> lose their outer brackets,
and <literal>\D</literal>, <literal>\S</literal>, and <literal>\W</literal> are illegal.
(So, for example, <literal>[a-c\d]</literal> is equivalent to
The class-shorthand escapes also work within bracket expressions,
although the definitions shown above are not quite syntactically
valid in that context.
For example, <literal>[a-c\d]</literal> is equivalent to
<literal>[a-c[:digit:]]</literal>.
Also, <literal>[a-c\D]</literal>, which is equivalent to
<literal>[a-c^[:digit:]]</literal>, is illegal.)
</para>
<table id="posix-constraint-escapes-table">
......
......@@ -519,15 +519,10 @@ character classes:
(note underscore)
.RE
.PP
Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
and `\fB\ew\fR'\&
lose their outer brackets,
and `\fB\eD\fR', `\fB\eS\fR',
and `\fB\eW\fR'\&
are illegal.
.VS 8.2
(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
The class-shorthand escapes also work within bracket expressions,
although the definitions shown above are not quite syntactically
valid in that context.
For example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
.VE 8.2
.PP
A constraint escape (AREs only) is a constraint,
......
......@@ -936,7 +936,16 @@ okcolors(struct nfa *nfa,
}
else if (cd->nschrs == 0 && cd->nuchrs == 0)
{
/* parent empty, its arcs change color to subcolor */
/*
* Parent is now empty, so just change all its arcs to the
* subcolor, then free the parent.
*
* It is not obvious that simply relabeling the arcs like this is
* OK; it appears to risk creating duplicate arcs. We are
* basically relying on the assumption that processing of a
* bracket expression can't create arcs of both a color and its
* subcolor between the bracket's endpoints.
*/
cd->sub = NOSUB;
scd = &cm->cd[sco];
assert(scd->nschrs > 0 || scd->nuchrs > 0);
......@@ -1062,6 +1071,7 @@ colorcomplement(struct nfa *nfa,
struct colordesc *cd;
struct colordesc *end = CDEND(cm);
color co;
struct arc *a;
assert(of != from);
......@@ -1069,10 +1079,26 @@ colorcomplement(struct nfa *nfa,
if (findarc(of, PLAIN, RAINBOW) != NULL)
return;
/* Otherwise, transiently mark the colors that appear in of's out-arcs */
for (a = of->outs; a != NULL; a = a->outchain)
{
if (a->type == PLAIN)
{
assert(a->co >= 0);
cd = &cm->cd[a->co];
assert(!UNUSEDCOLOR(cd));
cd->flags |= COLMARK;
}
}
/* Scan colors, clear transient marks, add arcs for unmarked colors */
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
if (findarc(of, PLAIN, co) == NULL)
{
if (cd->flags & COLMARK)
cd->flags &= ~COLMARK;
else if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
newarc(nfa, type, co, from, to);
}
}
......
......@@ -193,83 +193,6 @@ prefixes(struct vars *v)
}
}
/*
 * lexnest - divert the lexer onto an interpolated string
 *
 * Remembers the current scan position and limit in v->savenow and
 * v->savestop, then points the scanner at the range [beginp, endp).
 *
 * This is deliberately not a general mechanism: only one level of
 * interpolation is supported (see the assertion), and there are implicit
 * assumptions about what kinds of strings may be interpolated.
 */
static void
lexnest(struct vars *v,
		const chr *beginp,		/* first chr of the text to interpolate */
		const chr *endp)		/* one past the last chr of that text */
{
	/* an already-saved position would mean a second level of nesting */
	assert(v->savenow == NULL);

	/* remember where the outer scan was, and where it must stop */
	v->savestop = v->stop;
	v->savenow = v->now;

	/* from here on, scan the interpolated text instead */
	v->stop = endp;
	v->now = beginp;
}
/*
 * String constants to interpolate as expansions of the class-shorthand
 * escapes (\d, \s, \w and their complements \D, \S, \W).
 *
 * Each table spells out, chr by chr, the bracket-expression text that
 * replaces the escape.  The back* tables hold a complete bracket expression
 * for use outside brackets; the brback* tables hold the same text without
 * the outer brackets, for use when the escape appears inside a bracket
 * expression.  There are no brback* tables for the complemented escapes.
 */
static const chr backd[] = { /* \d  ->  [[:digit:]] */
CHR('['), CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']'), CHR(']')
};
static const chr backD[] = { /* \D  ->  [^[:digit:]] */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']'), CHR(']')
};
static const chr brbackd[] = { /* \d within brackets  ->  [:digit:] */
CHR('['), CHR(':'),
CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
CHR(':'), CHR(']')
};
static const chr backs[] = { /* \s  ->  [[:space:]] */
CHR('['), CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']'), CHR(']')
};
static const chr backS[] = { /* \S  ->  [^[:space:]] */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']'), CHR(']')
};
static const chr brbacks[] = { /* \s within brackets  ->  [:space:] */
CHR('['), CHR(':'),
CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
CHR(':'), CHR(']')
};
static const chr backw[] = { /* \w  ->  [[:alnum:]_]  (note the underscore) */
CHR('['), CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
CHR(':'), CHR(']'), CHR('_'), CHR(']')
};
static const chr backW[] = { /* \W  ->  [^[:alnum:]_]  (note the underscore) */
CHR('['), CHR('^'), CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
CHR(':'), CHR(']'), CHR('_'), CHR(']')
};
static const chr brbackw[] = { /* \w within brackets  ->  [:alnum:]_ */
CHR('['), CHR(':'),
CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
CHR(':'), CHR(']'), CHR('_')
};
/*
* lexword - interpolate a bracket expression for word characters
* Possibly ought to inquire whether there is a "word" character class.
*/
static void
lexword(struct vars *v)
{
lexnest(v, backw, ENDOF(backw));
}
/*
* next - get next token
*/
......@@ -292,14 +215,6 @@ next(struct vars *v)
RETV(SBEGIN, 0); /* same as \A */
}
/* if we're nested and we've hit end, return to outer level */
if (v->savenow != NULL && ATEOS())
{
v->now = v->savenow;
v->stop = v->savestop;
v->savenow = v->savestop = NULL;
}
/* skip white space etc. if appropriate (not in literal or []) */
if (v->cflags & REG_EXPANDED)
switch (v->lexcon)
......@@ -420,32 +335,15 @@ next(struct vars *v)
NOTE(REG_UNONPOSIX);
if (ATEOS())
FAILW(REG_EESCAPE);
(DISCARD) lexescape(v);
if (!lexescape(v))
return 0;
switch (v->nexttype)
{ /* not all escapes okay here */
case PLAIN:
case CCLASSS:
case CCLASSC:
return 1;
break;
case CCLASS:
switch (v->nextvalue)
{
case 'd':
lexnest(v, brbackd, ENDOF(brbackd));
break;
case 's':
lexnest(v, brbacks, ENDOF(brbacks));
break;
case 'w':
lexnest(v, brbackw, ENDOF(brbackw));
break;
default:
FAILW(REG_EESCAPE);
break;
}
/* lexnest done, back up and try again */
v->nexttype = v->lasttype;
return next(v);
break;
}
/* not one of the acceptable escapes */
FAILW(REG_EESCAPE);
......@@ -691,49 +589,17 @@ next(struct vars *v)
}
RETV(PLAIN, *v->now++);
}
(DISCARD) lexescape(v);
if (ISERR())
FAILW(REG_EESCAPE);
if (v->nexttype == CCLASS)
{ /* fudge at lexical level */
switch (v->nextvalue)
{
case 'd':
lexnest(v, backd, ENDOF(backd));
break;
case 'D':
lexnest(v, backD, ENDOF(backD));
break;
case 's':
lexnest(v, backs, ENDOF(backs));
break;
case 'S':
lexnest(v, backS, ENDOF(backS));
break;
case 'w':
lexnest(v, backw, ENDOF(backw));
break;
case 'W':
lexnest(v, backW, ENDOF(backW));
break;
default:
assert(NOTREACHED);
FAILW(REG_ASSERT);
break;
}
/* lexnest done, back up and try again */
v->nexttype = v->lasttype;
return next(v);
}
/* otherwise, lexescape has already done the work */
return !ISERR();
return lexescape(v);
}
/*
* lexescape - parse an ARE backslash escape (backslash already eaten)
* Note slightly nonstandard use of the CCLASS type code.
*
* This is used for ARE backslashes both normally and inside bracket
* expressions. In the latter case, not all escape types are allowed,
* but the caller must reject unwanted ones after we return.
*/
static int /* not actually used, but convenient for RETV */
static int
lexescape(struct vars *v)
{
chr c;
......@@ -775,11 +641,11 @@ lexescape(struct vars *v)
break;
case CHR('d'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'd');
RETV(CCLASSS, CC_DIGIT);
break;
case CHR('D'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'D');
RETV(CCLASSC, CC_DIGIT);
break;
case CHR('e'):
NOTE(REG_UUNPORT);
......@@ -802,11 +668,11 @@ lexescape(struct vars *v)
break;
case CHR('s'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 's');
RETV(CCLASSS, CC_SPACE);
break;
case CHR('S'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'S');
RETV(CCLASSC, CC_SPACE);
break;
case CHR('t'):
RETV(PLAIN, CHR('\t'));
......@@ -828,11 +694,11 @@ lexescape(struct vars *v)
break;
case CHR('w'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'w');
RETV(CCLASSS, CC_WORD);
break;
case CHR('W'):
NOTE(REG_ULOCALE);
RETV(CCLASS, 'W');
RETV(CCLASSC, CC_WORD);
break;
case CHR('x'):
NOTE(REG_UUNPORT);
......
......@@ -350,17 +350,13 @@ static const struct cname
};
/*
* The following arrays define the valid character class names.
* The following array defines the valid character class names.
* The entries must match enum char_classes in regguts.h.
*/
static const char *const classNames[NUM_CCLASSES + 1] = {
"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
"lower", "print", "punct", "space", "upper", "xdigit", NULL
};
enum classes
{
CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
"lower", "print", "punct", "space", "upper", "xdigit", "word",
NULL
};
/*
......@@ -536,54 +532,58 @@ eclass(struct vars *v, /* context */
}
/*
* cclass - supply cvec for a character class
*
* Must include case counterparts if "cases" is true.
* lookupcclass - lookup a character class identified by name
*
* The returned cvec might be either a transient cvec gotten from getcvec(),
* or a permanently cached one from pg_ctype_get_cache(). This is okay
* because callers are not supposed to explicitly free the result either way.
* On failure, sets an error code in *v; the result is then garbage.
*/
static struct cvec *
cclass(struct vars *v, /* context */
static enum char_classes
lookupcclass(struct vars *v, /* context (for returning errors) */
const chr *startp, /* where the name starts */
const chr *endp, /* just past the end of the name */
int cases) /* case-independent? */
const chr *endp) /* just past the end of the name */
{
size_t len;
struct cvec *cv = NULL;
const char *const *namePtr;
int i,
index;
int i;
/*
* Map the name to the corresponding enumerated value.
*/
len = endp - startp;
index = -1;
for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
{
if (strlen(*namePtr) == len &&
pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
{
index = i;
break;
return (enum char_classes) i;
}
}
if (index == -1)
{
ERR(REG_ECTYPE);
return NULL;
}
return (enum char_classes) 0;
}
/*
* cclasscvec - supply cvec for a character class
*
* Must include case counterparts if "cases" is true.
*
* The returned cvec might be either a transient cvec gotten from getcvec(),
* or a permanently cached one from pg_ctype_get_cache(). This is okay
* because callers are not supposed to explicitly free the result either way.
*/
static struct cvec *
cclasscvec(struct vars *v, /* context */
enum char_classes cclasscode, /* class to build a cvec for */
int cases) /* case-independent? */
{
struct cvec *cv = NULL;
/*
* Remap lower and upper to alpha if the match is case insensitive.
*/
if (cases &&
((enum classes) index == CC_LOWER ||
(enum classes) index == CC_UPPER))
index = (int) CC_ALPHA;
(cclasscode == CC_LOWER ||
cclasscode == CC_UPPER))
cclasscode = CC_ALPHA;
/*
* Now compute the character class contents. For classes that are based
......@@ -595,16 +595,19 @@ cclass(struct vars *v, /* context */
* NB: keep this code in sync with cclass_column_index(), below.
*/
switch ((enum classes) index)
switch (cclasscode)
{
case CC_PRINT:
cv = pg_ctype_get_cache(pg_wc_isprint, index);
cv = pg_ctype_get_cache(pg_wc_isprint, cclasscode);
break;
case CC_ALNUM:
cv = pg_ctype_get_cache(pg_wc_isalnum, index);
cv = pg_ctype_get_cache(pg_wc_isalnum, cclasscode);
break;
case CC_ALPHA:
cv = pg_ctype_get_cache(pg_wc_isalpha, index);
cv = pg_ctype_get_cache(pg_wc_isalpha, cclasscode);
break;
case CC_WORD:
cv = pg_ctype_get_cache(pg_wc_isword, cclasscode);
break;
case CC_ASCII:
/* hard-wired meaning */
......@@ -625,10 +628,10 @@ cclass(struct vars *v, /* context */
addrange(cv, 0x7f, 0x9f);
break;
case CC_DIGIT:
cv = pg_ctype_get_cache(pg_wc_isdigit, index);
cv = pg_ctype_get_cache(pg_wc_isdigit, cclasscode);
break;
case CC_PUNCT:
cv = pg_ctype_get_cache(pg_wc_ispunct, index);
cv = pg_ctype_get_cache(pg_wc_ispunct, cclasscode);
break;
case CC_XDIGIT:
......@@ -646,16 +649,16 @@ cclass(struct vars *v, /* context */
}
break;
case CC_SPACE:
cv = pg_ctype_get_cache(pg_wc_isspace, index);
cv = pg_ctype_get_cache(pg_wc_isspace, cclasscode);
break;
case CC_LOWER:
cv = pg_ctype_get_cache(pg_wc_islower, index);
cv = pg_ctype_get_cache(pg_wc_islower, cclasscode);
break;
case CC_UPPER:
cv = pg_ctype_get_cache(pg_wc_isupper, index);
cv = pg_ctype_get_cache(pg_wc_isupper, cclasscode);
break;
case CC_GRAPH:
cv = pg_ctype_get_cache(pg_wc_isgraph, index);
cv = pg_ctype_get_cache(pg_wc_isgraph, cclasscode);
break;
}
......@@ -678,7 +681,7 @@ cclass_column_index(struct colormap *cm, chr c)
/*
* Note: we should not see requests to consider cclasses that are not
* treated as locale-specific by cclass(), above.
* treated as locale-specific by cclasscvec(), above.
*/
if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
colnum |= cm->classbits[CC_PRINT];
......@@ -686,6 +689,8 @@ cclass_column_index(struct colormap *cm, chr c)
colnum |= cm->classbits[CC_ALNUM];
if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
colnum |= cm->classbits[CC_ALPHA];
if (cm->classbits[CC_WORD] && pg_wc_isword(c))
colnum |= cm->classbits[CC_WORD];
assert(cm->classbits[CC_ASCII] == 0);
assert(cm->classbits[CC_BLANK] == 0);
assert(cm->classbits[CC_CNTRL] == 0);
......
......@@ -400,6 +400,15 @@ pg_wc_isalnum(pg_wchar c)
return 0; /* can't get here, but keep compiler quiet */
}
/*
 * pg_wc_isword - is c a "word" character?
 *
 * Word characters are defined as the alnum class plus underscore.  The
 * result is 1 for an underscore; for any other character it is whatever
 * pg_wc_isalnum() returns.
 */
static int
pg_wc_isword(pg_wchar c)
{
	return (c == CHR('_')) ? 1 : pg_wc_isalnum(c);
}
static int
pg_wc_isupper(pg_wchar c)
{
......
This diff is collapsed.
......@@ -127,6 +127,18 @@
#define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
/*
 * Known character classes.
 *
 * The enumerators are listed in the same order as the class-name strings in
 * classNames[] (regc_locale.c); the two lists must be kept in step, and
 * NUM_CCLASSES must equal the number of enumerators.
 */
enum char_classes
{
	CC_ALNUM,					/* "alnum" */
	CC_ALPHA,					/* "alpha" */
	CC_ASCII,					/* "ascii" */
	CC_BLANK,					/* "blank" */
	CC_CNTRL,					/* "cntrl" */
	CC_DIGIT,					/* "digit" */
	CC_GRAPH,					/* "graph" */
	CC_LOWER,					/* "lower" */
	CC_PRINT,					/* "print" */
	CC_PUNCT,					/* "punct" */
	CC_SPACE,					/* "space" */
	CC_UPPER,					/* "upper" */
	CC_XDIGIT,					/* "xdigit" */
	CC_WORD						/* "word" */
};

#define NUM_CCLASSES 14
/*
* As soon as possible, we map chrs into equivalence classes -- "colors" --
* which are of much more manageable number.
......@@ -164,12 +176,14 @@ struct colordesc
#define NOSUB COLORLESS /* value of "sub" when no open subcolor */
struct arc *arcs; /* chain of all arcs of this color */
chr firstchr; /* simple char first assigned to this color */
int flags; /* bit values defined next */
int flags; /* bitmask of the following flags: */
#define FREECOL 01 /* currently free */
#define PSEUDO 02 /* pseudocolor, no real chars */
#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
#define COLMARK 04 /* temporary marker used in some functions */
};
#define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
/*
* The color map itself
*
......@@ -199,8 +213,6 @@ struct colordesc
* appear in increasing chr-value order.
*/
#define NUM_CCLASSES 13 /* must match data in regc_locale.c */
typedef struct colormaprange
{
chr cmin; /* range represents cmin..cmax inclusive */
......
......@@ -1970,6 +1970,256 @@ select * from test_regex('a[\w]b', 'axb', 'LPE');
{axb}
(2 rows)
-- these should be invalid
select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
ERROR: invalid regular expression: invalid character range
select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
ERROR: invalid regular expression: invalid character range
select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
ERROR: invalid regular expression: invalid character range
select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
ERROR: invalid regular expression: invalid character range
-- test complemented char classes within brackets
select * from test_regex('[\D]', '0123456789abc*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{a}
(2 rows)
select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{0}
(2 rows)
select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{1}
(2 rows)
select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{1}
(2 rows)
select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{2}
(2 rows)
select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{2}
(2 rows)
select * from test_regex('\W', '0123456789abc_*', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{*}
(2 rows)
select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{*}
(2 rows)
select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
test_regex
--------------------------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE,REG_UEMPTYMATCH}
{"012 3456789abc_*"}
(2 rows)
-- check char classes' handling of newlines
select * from test_regex('\s+', E'abc \n def', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('\s+', E'abc \n def', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('[\s]+', E'abc \n def', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('[\s]+', E'abc \n def', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{" +
"}
(2 rows)
select * from test_regex('\S+', E'abc\ndef', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('\S+', E'abc\ndef', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('\d+', E'012\n345', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('\d+', E'012\n345', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('[\d]+', E'012\n345', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('[\d]+', E'012\n345', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{012}
(2 rows)
select * from test_regex('\D+', E'abc\ndef345', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{"abc +
def"}
(2 rows)
select * from test_regex('\D+', E'abc\ndef345', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{"abc +
def"}
(2 rows)
select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc}
(2 rows)
select * from test_regex('\w+', E'abc_012\ndef', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{abc_012}
(2 rows)
select * from test_regex('\W+', E'***\n@@@___', 'LP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{"*** +
@@@"}
(2 rows)
select * from test_regex('\W+', E'***\n@@@___', 'nLP');
test_regex
-------------------------------
{0,REG_UNONPOSIX,REG_ULOCALE}
{***}
(2 rows)
select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{"*** +
@@@"}
(2 rows)
select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
test_regex
----------------------------------------
{0,REG_UBBS,REG_UNONPOSIX,REG_ULOCALE}
{***}
(2 rows)
-- doing 13 "escapes"
-- expectError 13.1 & "a\\" EESCAPE
select * from test_regex('a\', '', '');
......
......@@ -597,6 +597,50 @@ select * from test_regex('a[\s]b', 'a b', 'LPE');
-- expectMatch 12.18 LPE {a[\w]b} axb axb
select * from test_regex('a[\w]b', 'axb', 'LPE');
-- these should be invalid
select * from test_regex('[\w-~]*', 'ab01_~-`**', 'LNPSE');
select * from test_regex('[~-\w]*', 'ab01_~-`**', 'LNPSE');
select * from test_regex('[[:alnum:]-~]*', 'ab01~-`**', 'LNS');
select * from test_regex('[~-[:alnum:]]*', 'ab01~-`**', 'LNS');
-- test complemented char classes within brackets
select * from test_regex('[\D]', '0123456789abc*', 'LPE');
select * from test_regex('[^\D]', 'abc0123456789*', 'LPE');
select * from test_regex('[1\D7]', '0123456789abc*', 'LPE');
select * from test_regex('[7\D1]', '0123456789abc*', 'LPE');
select * from test_regex('[^0\D1]', 'abc0123456789*', 'LPE');
select * from test_regex('[^1\D0]', 'abc0123456789*', 'LPE');
select * from test_regex('\W', '0123456789abc_*', 'LP');
select * from test_regex('[\W]', '0123456789abc_*', 'LPE');
select * from test_regex('[\s\S]*', '012 3456789abc_*', 'LNPE');
-- check char classes' handling of newlines
select * from test_regex('\s+', E'abc \n def', 'LP');
select * from test_regex('\s+', E'abc \n def', 'nLP');
select * from test_regex('[\s]+', E'abc \n def', 'LPE');
select * from test_regex('[\s]+', E'abc \n def', 'nLPE');
select * from test_regex('\S+', E'abc\ndef', 'LP');
select * from test_regex('\S+', E'abc\ndef', 'nLP');
select * from test_regex('[\S]+', E'abc\ndef', 'LPE');
select * from test_regex('[\S]+', E'abc\ndef', 'nLPE');
select * from test_regex('\d+', E'012\n345', 'LP');
select * from test_regex('\d+', E'012\n345', 'nLP');
select * from test_regex('[\d]+', E'012\n345', 'LPE');
select * from test_regex('[\d]+', E'012\n345', 'nLPE');
select * from test_regex('\D+', E'abc\ndef345', 'LP');
select * from test_regex('\D+', E'abc\ndef345', 'nLP');
select * from test_regex('[\D]+', E'abc\ndef345', 'LPE');
select * from test_regex('[\D]+', E'abc\ndef345', 'nLPE');
select * from test_regex('\w+', E'abc_012\ndef', 'LP');
select * from test_regex('\w+', E'abc_012\ndef', 'nLP');
select * from test_regex('[\w]+', E'abc_012\ndef', 'LPE');
select * from test_regex('[\w]+', E'abc_012\ndef', 'nLPE');
select * from test_regex('\W+', E'***\n@@@___', 'LP');
select * from test_regex('\W+', E'***\n@@@___', 'nLP');
select * from test_regex('[\W]+', E'***\n@@@___', 'LPE');
select * from test_regex('[\W]+', E'***\n@@@___', 'nLPE');
-- doing 13 "escapes"
-- expectError 13.1 & "a\\" EESCAPE
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment