Commit ced3a93c authored by Tom Lane

Fix assorted bugs in contrib/unaccent's configuration file parsing.

Make it use t_isspace() to identify whitespace, rather than relying on
sscanf which is known to get it wrong on some platform/locale combinations.
Get rid of fixed-size buffers.  Make it actually continue to parse the file
after ignoring a line with untranslatable characters, as was obviously
intended.

The first of these issues is per gripe from J Smith, though not exactly
either of his proposed patches.
parent ffc703a8
@@ -91,35 +91,83 @@ initSuffixTree(char *filename)
do do
{ {
char src[4096]; /*
char trg[4096]; * pg_do_encoding_conversion() (called by tsearch_readline()) will
int srclen; * emit exception if it finds untranslatable characters in current
int trglen; * locale. We just skip such lines, continuing with the next.
char *line = NULL; */
skip = true; skip = true;
PG_TRY(); PG_TRY();
{
char *line;
while ((line = tsearch_readline(&trst)) != NULL)
{ {
/* /*
* pg_do_encoding_conversion() (called by tsearch_readline()) will * The format of each line must be "src trg" where src and trg
* emit exception if it finds untranslatable characters in current * are sequences of one or more non-whitespace characters,
* locale. We just skip such characters. * separated by whitespace. Whitespace at start or end of
* line is ignored.
*/ */
while ((line = tsearch_readline(&trst)) != NULL) int state;
char *ptr;
char *src = NULL;
char *trg = NULL;
int ptrlen;
int srclen = 0;
int trglen = 0;
state = 0;
for (ptr = line; *ptr; ptr += ptrlen)
{
ptrlen = pg_mblen(ptr);
/* ignore whitespace, but end src or trg */
if (t_isspace(ptr))
{ {
if (sscanf(line, "%s\t%s\n", src, trg) != 2) if (state == 1)
state = 2;
else if (state == 3)
state = 4;
continue; continue;
}
switch (state)
{
case 0:
/* start of src */
src = ptr;
srclen = ptrlen;
state = 1;
break;
case 1:
/* continue src */
srclen += ptrlen;
break;
case 2:
/* start of trg */
trg = ptr;
trglen = ptrlen;
state = 3;
break;
case 3:
/* continue trg */
trglen += ptrlen;
break;
default:
/* bogus line format */
state = -1;
break;
}
}
srclen = strlen(src); if (state >= 3)
trglen = strlen(trg);
rootSuffixTree = placeChar(rootSuffixTree, rootSuffixTree = placeChar(rootSuffixTree,
(unsigned char *) src, srclen, (unsigned char *) src, srclen,
trg, trglen); trg, trglen);
skip = false;
pfree(line); pfree(line);
} }
skip = false;
} }
PG_CATCH(); PG_CATCH();
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment