Commit eb5834d5 authored by Tom Lane

Further improvement of make_greater_string.

Make sure that it considers all the possibilities that the old code did,
instead of trying only one possibility per character position.  To keep the
runtime in bounds, instead tweak the character incrementers to not try
every possible multibyte character code.  Remove unnecessary logic to
restore the old character value on failure.  Additional comment and
formatting cleanup.
parent fae54e4a
...@@ -5701,13 +5701,23 @@ byte_increment(unsigned char *ptr, int len) ...@@ -5701,13 +5701,23 @@ byte_increment(unsigned char *ptr, int len)
* and "9" is seen as largest by the collation, and append that to the given * and "9" is seen as largest by the collation, and append that to the given
* prefix before trying to find a string that compares as larger. * prefix before trying to find a string that compares as larger.
* *
* If we max out the righthand byte, truncate off the last character * To search for a greater string, we repeatedly "increment" the rightmost
* and start incrementing the next. For example, if "z" were the last * character, using an encoding-specific character incrementer function.
* character in the sort order, then we could produce "foo" as a * When it's no longer possible to increment the last character, we truncate
* string greater than "fonz". * off that character and start incrementing the next-to-rightmost.
* For example, if "z" were the last character in the sort order, then we
* could produce "foo" as a string greater than "fonz".
* *
* This could be rather slow in the worst case, but in most cases we * This could be rather slow in the worst case, but in most cases we
* won't have to try more than one or two strings before succeeding. * won't have to try more than one or two strings before succeeding.
*
* Note that it's important for the character incrementer not to be too anal
* about producing every possible character code, since in some cases the only
* way to get a larger string is to increment a previous character position.
* So we don't want to spend too much time trying every possible character
* code at the last position. A good rule of thumb is to be sure that we
* don't try more than 256*K values for a K-byte character (and definitely
* not 256^K, which is what an exhaustive search would approach).
*/ */
Const * Const *
make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
...@@ -5779,17 +5789,19 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) ...@@ -5779,17 +5789,19 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
} }
} }
/* Select appropriate character-incrementer function */
if (datatype == BYTEAOID) if (datatype == BYTEAOID)
charinc = &byte_increment; charinc = byte_increment;
else else
charinc = pg_database_encoding_character_incrementer(); charinc = pg_database_encoding_character_incrementer();
/* And search ... */
while (len > 0) while (len > 0)
{ {
int charlen; int charlen;
unsigned char *lastchar; unsigned char *lastchar;
Const *workstr_const;
/* Identify the last character --- for bytea, just the last byte */
if (datatype == BYTEAOID) if (datatype == BYTEAOID)
charlen = 1; charlen = 1;
else else
...@@ -5799,9 +5811,15 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) ...@@ -5799,9 +5811,15 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
/* /*
* Try to generate a larger string by incrementing the last character * Try to generate a larger string by incrementing the last character
* (for BYTEA, we treat each byte as a character). * (for BYTEA, we treat each byte as a character).
*
* Note: the incrementer function is expected to return true if it's
* generated a valid-per-the-encoding new character, otherwise false.
* The contents of the character on false return are unspecified.
*/ */
if (charinc(lastchar, charlen)) while (charinc(lastchar, charlen))
{ {
Const *workstr_const;
if (datatype == BYTEAOID) if (datatype == BYTEAOID)
workstr_const = string_to_bytea_const(workstr, len); workstr_const = string_to_bytea_const(workstr, len);
else else
...@@ -5825,7 +5843,8 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation) ...@@ -5825,7 +5843,8 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
} }
/* /*
* Truncate off the last character or byte. * No luck here, so truncate off the last character and try to
* increment the next one.
*/ */
len -= charlen; len -= charlen;
workstr[len] = '\0'; workstr[len] = '\0';
......
...@@ -1337,85 +1337,78 @@ pg_utf8_islegal(const unsigned char *source, int length) ...@@ -1337,85 +1337,78 @@ pg_utf8_islegal(const unsigned char *source, int length)
#ifndef FRONTEND #ifndef FRONTEND
/* /*
* Generic character increment function. * Generic character incrementer function.
* *
* Not knowing anything about the properties of the encoding in use, we just * Not knowing anything about the properties of the encoding in use, we just
* keep incrementing the last byte until pg_verifymbstr() likes the result, * keep incrementing the last byte until we get a validly-encoded result,
* or we run out of values to try. * or we run out of values to try. We don't bother to try incrementing
* * higher-order bytes, so there's no growth in runtime for wider characters.
* Like all character-increment functions, we must restore the original input * (If we did try to do that, we'd need to consider the likelihood that 255
* string on failure. * is not a valid final byte in the encoding.)
*/ */
static bool static bool
pg_generic_charinc(unsigned char *charptr, int len) pg_generic_charinc(unsigned char *charptr, int len)
{ {
unsigned char *lastchar = (unsigned char *) (charptr + len - 1); unsigned char *lastbyte = charptr + len - 1;
unsigned char savelastchar = *lastchar; mbverifier mbverify;
const char *const_charptr = (const char *)charptr;
/* We can just invoke the character verifier directly. */
while (*lastchar < (unsigned char) 255) mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
{
(*lastchar)++; while (*lastbyte < (unsigned char) 255)
if (!pg_verifymbstr(const_charptr, len, true)) {
continue; (*lastbyte)++;
return true; if ((*mbverify) (charptr, len) == len)
} return true;
}
*lastchar = savelastchar;
return false; return false;
} }
/* /*
* UTF-8 character increment function. * UTF-8 character incrementer function.
* *
* For a one-byte character less than 0x7F, we just increment the byte. * For a one-byte character less than 0x7F, we just increment the byte.
* *
* For a multibyte character, every byte but the first must fall between 0x80 * For a multibyte character, every byte but the first must fall between 0x80
* and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment * and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
* the last byte that's not already at its maximum value, and set any following * the last byte that's not already at its maximum value. If we can't find a
* bytes back to 0x80. If we can't find a byte that's less than the maximum * byte that's less than the maximum allowable value, we simply fail. We also
* allowable vale, we simply fail. We also have some special-case logic to * need some special-case logic to skip regions used for surrogate pair
* skip regions used for surrogate pair handling, as those should not occur in * handling, as those should not occur in valid UTF-8.
* valid UTF-8.
* *
* Like all character-increment functions, we must restore the original input * Note that we don't reset lower-order bytes back to their minimums, since
* string on failure. * we can't afford to make an exhaustive search (see make_greater_string).
*/ */
static bool static bool
pg_utf8_increment(unsigned char *charptr, int length) pg_utf8_increment(unsigned char *charptr, int length)
{ {
unsigned char a; unsigned char a;
unsigned char bak[4];
unsigned char limit; unsigned char limit;
switch (length) switch (length)
{ {
default: default:
/* reject lengths 5 and 6 for now */ /* reject lengths 5 and 6 for now */
return false; return false;
case 4: case 4:
bak[3] = charptr[3]; a = charptr[3];
a = charptr[3]; if (a < 0xBF)
if (a < 0xBF) {
{ charptr[3]++;
charptr[3]++; break;
break; }
} /* FALL THRU */
charptr[3] = 0x80; case 3:
/* FALL THRU */ a = charptr[2];
case 3: if (a < 0xBF)
bak[2] = charptr[2]; {
a = charptr[2]; charptr[2]++;
if (a < 0xBF) break;
{ }
charptr[2]++; /* FALL THRU */
break; case 2:
} a = charptr[1];
charptr[2] = 0x80;
/* FALL THRU */
case 2:
bak[1] = charptr[1];
a = charptr[1];
switch (*charptr) switch (*charptr)
{ {
case 0xED: case 0xED:
...@@ -1430,147 +1423,126 @@ pg_utf8_increment(unsigned char *charptr, int length) ...@@ -1430,147 +1423,126 @@ pg_utf8_increment(unsigned char *charptr, int length)
} }
if (a < limit) if (a < limit)
{ {
charptr[1]++; charptr[1]++;
break; break;
} }
charptr[1] = 0x80; /* FALL THRU */
/* FALL THRU */ case 1:
case 1: a = *charptr;
bak[0] = *charptr; if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
a = *charptr; return false;
if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4) charptr[0]++;
{ break;
/* Restore original string. */ }
memcpy(charptr, bak, length);
return false;
}
charptr[0]++;
break;
}
return true; return true;
} }
/* /*
* EUC-JP character increment function. * EUC-JP character incrementer function.
* *
* If the sequence starts with SS2(0x8e), it must be a two-byte sequence * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
* representing JIS X 0201 characters with the second byte ranges between * representing JIS X 0201 characters with the second byte ranging between
* 0xa1 and 0xde. We just increment the last byte if it's less than 0xde, * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
* and otherwise rewrite whole the sequence to 0xa1 0xa1. * and otherwise rewrite the whole sequence to 0xa1 0xa1.
* *
* If the sequence starts with SS3(0x8f), it must be a three-byte sequence * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
* which the last two bytes ranges between 0xa1 and 0xfe. The last byte * in which the last two bytes range between 0xa1 and 0xfe. The last byte
* is incremented, carrying overflow to the second-to-last byte. * is incremented if possible, otherwise the second-to-last byte.
* *
* If the sequence starts with the values other than the aboves and its MSB * If the sequence starts with a value other than the above and its MSB
* is set, it must be a two-byte sequence representing JIS X 0208 characters * is set, it must be a two-byte sequence representing JIS X 0208 characters
* with both bytes ranges between 0xa1 and 0xfe. The last byte is incremented, * with both bytes ranging between 0xa1 and 0xfe. The last byte is
* carrying overflow to the second-to-last byte. * incremented if possible, otherwise the second-to-last byte.
* *
* Otherwise the sequence is consists of single byte representing ASCII * Otherwise, the sequence is a single-byte ASCII character. It is
* characters. It is incremented up to 0x7f. * incremented up to 0x7f.
*
* Only three EUC-JP byte sequences shown below - which have no character
* allocated - make this function to fail in spite of its validity: 0x7f,
* 0xfe 0xfe, 0x8f 0xfe 0xfe.
*/ */
static bool static bool
pg_eucjp_increment(unsigned char *charptr, int length) pg_eucjp_increment(unsigned char *charptr, int length)
{ {
unsigned char bak[3]; unsigned char c1,
unsigned char c1, c2; c2;
signed int i; int i;
c1 = *charptr; c1 = *charptr;
switch (c1) switch (c1)
{ {
case SS2: /* JIS X 0201 */ case SS2: /* JIS X 0201 */
if (length != 2) if (length != 2)
return false; return false;
c2 = charptr[1]; c2 = charptr[1];
if (c2 > 0xde)
charptr[0] = charptr[1] = 0xa1;
else if (c2 < 0xa1)
charptr[1] = 0xa1;
else
charptr[1]++;
break; if (c2 >= 0xdf)
charptr[0] = charptr[1] = 0xa1;
else if (c2 < 0xa1)
charptr[1] = 0xa1;
else
charptr[1]++;
break;
case SS3: /* JIS X 0212 */ case SS3: /* JIS X 0212 */
if (length != 3) if (length != 3)
return false; return false;
for (i = 2; i > 0; i--) for (i = 2; i > 0; i--)
{ {
bak[i] = charptr[i]; c2 = charptr[i];
c2 = charptr[i]; if (c2 < 0xa1)
if (c2 < 0xa1) {
{ charptr[i] = 0xa1;
charptr[i] = 0xa1; return true;
return true; }
} else if (c2 < 0xfe)
else if (c2 < 0xfe) {
{ charptr[i]++;
charptr[i]++; return true;
break; }
} }
charptr[i] = 0xa1;
} /* Out of 3-byte code region */
return false;
if (i == 0) /* Out of 3-byte code region */
{ default:
charptr[1] = bak[1]; if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
charptr[2] = bak[2]; {
return false; if (length != 2)
} return false;
break;
for (i = 1; i >= 0; i--)
default: {
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */ c2 = charptr[i];
{ if (c2 < 0xa1)
if (length != 2) {
charptr[i] = 0xa1;
return true;
}
else if (c2 < 0xfe)
{
charptr[i]++;
return true;
}
}
/* Out of 2 byte code region */
return false;
}
else
{ /* ASCII, single byte */
if (c1 > 0x7e)
return false; return false;
(*charptr)++;
}
break;
}
for (i = 1 ; i >= 0 ; i--) /* i must be signed */ return true;
{
bak[i] = charptr[i];
c2 = charptr[i];
if (c2 < 0xa1)
{
charptr[i] = 0xa1;
return true;
}
else if (c2 < 0xfe)
{
charptr[i]++;
break;
}
charptr[i] = 0xa1;
}
if (i < 0) /* Out of 2 byte code region */
{
charptr[0] = bak[0];
charptr[1] = bak[1];
return false;
}
}
else
{ /* ASCII, single byte */
if (c1 > 0x7e)
return false;
(*charptr)++;
}
}
return true;
} }
#endif
#endif /* !FRONTEND */
/* /*
*------------------------------------------------------------------- *-------------------------------------------------------------------
...@@ -1697,19 +1669,23 @@ pg_database_encoding_max_length(void) ...@@ -1697,19 +1669,23 @@ pg_database_encoding_max_length(void)
} }
/* /*
* give the character incrementer for the encoding for the current database * get the character incrementer for the encoding for the current database
*/ */
mbcharacter_incrementer mbcharacter_incrementer
pg_database_encoding_character_incrementer(void) pg_database_encoding_character_incrementer(void)
{ {
/*
* Eventually it might be best to add a field to pg_wchar_table[],
* but for now we just use a switch.
*/
switch (GetDatabaseEncoding()) switch (GetDatabaseEncoding())
{ {
case PG_UTF8: case PG_UTF8:
return pg_utf8_increment; return pg_utf8_increment;
case PG_EUC_JP: case PG_EUC_JP:
return pg_eucjp_increment; return pg_eucjp_increment;
default: default:
return pg_generic_charinc; return pg_generic_charinc;
} }
...@@ -1908,4 +1884,4 @@ report_untranslatable_char(int src_encoding, int dest_encoding, ...@@ -1908,4 +1884,4 @@ report_untranslatable_char(int src_encoding, int dest_encoding,
pg_enc2name_tbl[dest_encoding].name))); pg_enc2name_tbl[dest_encoding].name)));
} }
#endif #endif /* !FRONTEND */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment