Commit b80e1063 authored by Heikki Linnakangas

Add mbverifystr() functions specific to each encoding.

This makes the pg_verify_mbstr() function faster by allowing more efficient
encoding-specific implementations. All the implementations included in
this commit are pretty naive: they just call the same encoding-specific
verifychar functions that were used previously. Even so, that already gives
a performance boost, because the tight character-at-a-time loop is simpler.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01@iki.fi
parent a3367aa3
......@@ -682,7 +682,7 @@ read_extension_script_file(const ExtensionControlFile *control,
src_encoding = control->encoding;
/* make sure that source string is valid in the expected encoding */
pg_verify_mbstr_len(src_encoding, src_str, len, false);
(void) pg_verify_mbstr(src_encoding, src_str, len, false);
/*
* Convert the encoding to the database encoding. read_whole_file
......
......@@ -653,7 +653,7 @@ LocalToUtf(const unsigned char *iso, int len,
continue;
}
l = pg_encoding_verifymb(encoding, (const char *) iso, len);
l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
if (l < 0)
break;
......
......@@ -87,7 +87,7 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len)
continue;
}
l = pg_encoding_verifymb(PG_EUC_JIS_2004, (const char *) euc, len);
l = pg_encoding_verifymbchar(PG_EUC_JIS_2004, (const char *) euc, len);
if (l < 0)
report_invalid_encoding(PG_EUC_JIS_2004,
......@@ -238,7 +238,7 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len
continue;
}
l = pg_encoding_verifymb(PG_SHIFT_JIS_2004, (const char *) sjis, len);
l = pg_encoding_verifymbchar(PG_SHIFT_JIS_2004, (const char *) sjis, len);
if (l < 0 || l > len)
report_invalid_encoding(PG_SHIFT_JIS_2004,
......
......@@ -291,7 +291,7 @@ mic2sjis(const unsigned char *mic, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
if (l < 0)
report_invalid_encoding(PG_MULE_INTERNAL,
(const char *) mic, len);
......@@ -381,7 +381,7 @@ euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
if (l < 0)
report_invalid_encoding(PG_EUC_JP,
(const char *) euc, len);
......@@ -431,7 +431,7 @@ mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
if (l < 0)
report_invalid_encoding(PG_MULE_INTERNAL,
(const char *) mic, len);
......@@ -485,7 +485,7 @@ euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len);
if (l < 0)
report_invalid_encoding(PG_EUC_JP,
(const char *) euc, len);
......@@ -580,7 +580,7 @@ sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_SJIS, (const char *) sjis, len);
l = pg_encoding_verifymbchar(PG_SJIS, (const char *) sjis, len);
if (l < 0)
report_invalid_encoding(PG_SJIS,
(const char *) sjis, len);
......
......@@ -76,7 +76,7 @@ euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
c1 = *euc;
if (IS_HIGHBIT_SET(c1))
{
l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
l = pg_encoding_verifymbchar(PG_EUC_KR, (const char *) euc, len);
if (l != 2)
report_invalid_encoding(PG_EUC_KR,
(const char *) euc, len);
......@@ -122,7 +122,7 @@ mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
if (l < 0)
report_invalid_encoding(PG_MULE_INTERNAL,
(const char *) mic, len);
......
......@@ -148,7 +148,7 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
c1 = *euc;
if (IS_HIGHBIT_SET(c1))
{
l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len);
l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
if (l < 0)
report_invalid_encoding(PG_EUC_TW,
(const char *) euc, len);
......@@ -213,7 +213,7 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
if (l < 0)
report_invalid_encoding(PG_MULE_INTERNAL,
(const char *) mic, len);
......@@ -272,7 +272,7 @@ big52mic(const unsigned char *big5, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len);
l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
if (l < 0)
report_invalid_encoding(PG_BIG5,
(const char *) big5, len);
......@@ -321,7 +321,7 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len)
len--;
continue;
}
l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len);
if (l < 0)
report_invalid_encoding(PG_MULE_INTERNAL,
(const char *) mic, len);
......
......@@ -519,7 +519,7 @@ pg_convert(PG_FUNCTION_ARGS)
/* make sure that source string is valid */
len = VARSIZE_ANY_EXHDR(string);
src_str = VARDATA_ANY(string);
pg_verify_mbstr_len(src_encoding, src_str, len, false);
(void) pg_verify_mbstr(src_encoding, src_str, len, false);
/* perform conversion */
dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
......@@ -1215,10 +1215,10 @@ static bool
pg_generic_charinc(unsigned char *charptr, int len)
{
unsigned char *lastbyte = charptr + len - 1;
mbverifier mbverify;
mbchar_verifier mbverify;
/* We can just invoke the character verifier directly. */
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
while (*lastbyte < (unsigned char) 255)
{
......@@ -1445,8 +1445,7 @@ pg_database_encoding_max_length(void)
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
return
pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
}
/*
......@@ -1456,7 +1455,18 @@ pg_verifymbstr(const char *mbstr, int len, bool noError)
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
int oklen;
Assert(PG_VALID_ENCODING(encoding));
oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
if (oklen != len)
{
if (noError)
return false;
report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
}
return true;
}
/*
......@@ -1469,11 +1479,14 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
* If OK, return length of string in the encoding.
* If a problem is found, return -1 when noError is
* true; when noError is false, ereport() a descriptive message.
*
* Note: We cannot use the faster encoding-specific mbverifystr() function
* here, because we need to count the number of characters in the string.
*/
int
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
{
mbverifier mbverify;
mbchar_verifier mbverifychar;
int mb_len;
Assert(PG_VALID_ENCODING(encoding));
......@@ -1493,7 +1506,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
}
/* fetch function pointer just once */
mbverify = pg_wchar_table[encoding].mbverify;
mbverifychar = pg_wchar_table[encoding].mbverifychar;
mb_len = 0;
......@@ -1516,7 +1529,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
report_invalid_encoding(encoding, mbstr, len);
}
l = (*mbverify) ((const unsigned char *) mbstr, len);
l = (*mbverifychar) ((const unsigned char *) mbstr, len);
if (l < 0)
{
......
This diff is collapsed.
......@@ -371,7 +371,9 @@ typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
typedef bool (*mbcharacter_incrementer) (unsigned char *mbstr, int len);
typedef int (*mbverifier) (const unsigned char *mbstr, int len);
typedef int (*mbchar_verifier) (const unsigned char *mbstr, int len);
typedef int (*mbstr_verifier) (const unsigned char *mbstr, int len);
typedef struct
{
......@@ -381,7 +383,8 @@ typedef struct
* to a multibyte */
mblen_converter mblen; /* get byte length of a char */
mbdisplaylen_converter dsplen; /* get display width of a char */
mbverifier mbverify; /* verify multibyte sequence */
mbchar_verifier mbverifychar; /* verify multibyte character */
mbstr_verifier mbverifystr; /* verify multibyte string */
int maxmblen; /* max bytes for a char in this encoding */
} pg_wchar_tbl;
......@@ -554,7 +557,8 @@ extern int pg_valid_server_encoding_id(int encoding);
*/
extern int pg_encoding_mblen(int encoding, const char *mbstr);
extern int pg_encoding_dsplen(int encoding, const char *mbstr);
extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len);
extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);
extern int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len);
extern int pg_encoding_max_length(int encoding);
extern int pg_valid_client_encoding(const char *name);
extern int pg_valid_server_encoding(const char *name);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment