Commit 419fe7cd authored by Teodor Sigaev

Fix bug http://archives.postgresql.org/pgsql-bugs/2006-10/msg00258.php.

Fix string's length calculation for recoding, fix lowerstr() to avoid wrong
assumption about length of recoded string (was: recoded string is no greater
than source, which may not be true for multibyte encodings)
Thanks to Thomas H. <me@alternize.com> and Magnus Hagander <mha@sollentuna.net>
parent 1a5c450f
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.8 2006/03/11 04:38:30 momjian Exp $ */ /* $PostgreSQL: pgsql/contrib/tsearch2/dict_ex.c,v 1.9 2006/11/20 14:03:30 teodor Exp $ */
/* /*
* example of dictionary * example of dictionary
...@@ -52,9 +52,11 @@ dex_lexize(PG_FUNCTION_ARGS) ...@@ -52,9 +52,11 @@ dex_lexize(PG_FUNCTION_ARGS)
{ {
DictExample *d = (DictExample *) PG_GETARG_POINTER(0); DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1); char *in = (char *) PG_GETARG_POINTER(1);
char *txt = pnstrdup(in, PG_GETARG_INT32(2)); char *utxt = pnstrdup(in, PG_GETARG_INT32(2));
TSLexeme *res = palloc(sizeof(TSLexeme) * 2); TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
char *txt = lowerstr(utxt);
pfree(utxt);
memset(res, 0, sizeof(TSLexeme) * 2); memset(res, 0, sizeof(TSLexeme) * 2);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
......
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_snowball.c,v 1.12 2006/07/11 16:35:31 momjian Exp $ */ /* $PostgreSQL: pgsql/contrib/tsearch2/dict_snowball.c,v 1.13 2006/11/20 14:03:30 teodor Exp $ */
/* /*
* example of Snowball dictionary * example of Snowball dictionary
...@@ -142,9 +142,11 @@ snb_lexize(PG_FUNCTION_ARGS) ...@@ -142,9 +142,11 @@ snb_lexize(PG_FUNCTION_ARGS)
{ {
DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0); DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1); char *in = (char *) PG_GETARG_POINTER(1);
char *txt = pnstrdup(in, PG_GETARG_INT32(2)); char *utxt = pnstrdup(in, PG_GETARG_INT32(2));
TSLexeme *res = palloc(sizeof(TSLexeme) * 2); TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
char *txt = lowerstr(utxt);
pfree(utxt);
memset(res, 0, sizeof(TSLexeme) * 2); memset(res, 0, sizeof(TSLexeme) * 2);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
{ {
......
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.9 2006/03/11 04:38:30 momjian Exp $ */ /* $PostgreSQL: pgsql/contrib/tsearch2/dict_syn.c,v 1.10 2006/11/20 14:03:30 teodor Exp $ */
/* /*
* ISpell interface * ISpell interface
...@@ -132,8 +132,8 @@ syn_init(PG_FUNCTION_ARGS) ...@@ -132,8 +132,8 @@ syn_init(PG_FUNCTION_ARGS)
continue; continue;
*end = '\0'; *end = '\0';
d->syn[cur].in = strdup(lowerstr(starti)); d->syn[cur].in = lowerstr(starti);
d->syn[cur].out = strdup(lowerstr(starto)); d->syn[cur].out = lowerstr(starto);
if (!(d->syn[cur].in && d->syn[cur].out)) if (!(d->syn[cur].in && d->syn[cur].out))
{ {
fclose(fin); fclose(fin);
...@@ -163,12 +163,15 @@ syn_lexize(PG_FUNCTION_ARGS) ...@@ -163,12 +163,15 @@ syn_lexize(PG_FUNCTION_ARGS)
Syn key, Syn key,
*found; *found;
TSLexeme *res = NULL; TSLexeme *res = NULL;
char *wrd;
if (!PG_GETARG_INT32(2)) if (!PG_GETARG_INT32(2))
PG_RETURN_POINTER(NULL); PG_RETURN_POINTER(NULL);
key.out = NULL; key.out = NULL;
key.in = lowerstr(pnstrdup(in, PG_GETARG_INT32(2))); wrd = pnstrdup(in, PG_GETARG_INT32(2));
key.in = lowerstr(wrd);
pfree(wrd);
found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn); found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
pfree(key.in); pfree(key.in);
......
...@@ -147,7 +147,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag) ...@@ -147,7 +147,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag)
int int
NIImportDictionary(IspellDict * Conf, const char *filename) NIImportDictionary(IspellDict * Conf, const char *filename)
{ {
char str[BUFSIZ]; char str[BUFSIZ], *pstr;
FILE *dict; FILE *dict;
if (!(dict = fopen(filename, "r"))) if (!(dict = fopen(filename, "r")))
...@@ -190,9 +190,10 @@ NIImportDictionary(IspellDict * Conf, const char *filename) ...@@ -190,9 +190,10 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
} }
s += pg_mblen(s); s += pg_mblen(s);
} }
lowerstr(str); pstr = lowerstr(str);
NIAddSpell(Conf, str, flag); NIAddSpell(Conf, pstr, flag);
pfree(pstr);
} }
fclose(dict); fclose(dict);
return (0); return (0);
...@@ -418,8 +419,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, int line) ...@@ -418,8 +419,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, int line)
int int
NIImportAffixes(IspellDict * Conf, const char *filename) NIImportAffixes(IspellDict * Conf, const char *filename)
{ {
char str[BUFSIZ]; char str[BUFSIZ], *pstr = NULL;
char tmpstr[BUFSIZ];
char mask[BUFSIZ]; char mask[BUFSIZ];
char find[BUFSIZ]; char find[BUFSIZ];
char repl[BUFSIZ]; char repl[BUFSIZ];
...@@ -439,11 +439,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ...@@ -439,11 +439,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
while (fgets(str, sizeof(str), affix)) while (fgets(str, sizeof(str), affix))
{ {
line++; line++;
if ( *str == '#' || *str == '\n' )
continue;
pg_verifymbstr(str, strlen(str), false); pg_verifymbstr(str, strlen(str), false);
memcpy(tmpstr, str, 32); /* compoundwords... */ if ( pstr )
tmpstr[32] = '\0'; pfree( pstr );
lowerstr(tmpstr); pstr = lowerstr(str);
if (STRNCMP(tmpstr, "compoundwords") == 0) if (STRNCMP(pstr, "compoundwords") == 0)
{ {
s = findchar(str, 'l'); s = findchar(str, 'l');
if (s) if (s)
...@@ -458,21 +461,21 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ...@@ -458,21 +461,21 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
continue; continue;
} }
} }
if (STRNCMP(tmpstr, "suffixes") == 0) if (STRNCMP(pstr, "suffixes") == 0)
{ {
suffixes = 1; suffixes = 1;
prefixes = 0; prefixes = 0;
oldformat++; oldformat++;
continue; continue;
} }
if (STRNCMP(tmpstr, "prefixes") == 0) if (STRNCMP(pstr, "prefixes") == 0)
{ {
suffixes = 0; suffixes = 0;
prefixes = 1; prefixes = 1;
oldformat++; oldformat++;
continue; continue;
} }
if (STRNCMP(tmpstr, "flag") == 0) if (STRNCMP(pstr, "flag") == 0)
{ {
s = str + 4; s = str + 4;
flagflags = 0; flagflags = 0;
...@@ -523,14 +526,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ...@@ -523,14 +526,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
if ((!suffixes) && (!prefixes)) if ((!suffixes) && (!prefixes))
continue; continue;
lowerstr(str); if (!parse_affentry(pstr, mask, find, repl, line))
if (!parse_affentry(str, mask, find, repl, line))
continue; continue;
NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
} }
fclose(affix); fclose(affix);
if ( pstr )
pfree( pstr );
return (0); return (0);
} }
...@@ -538,11 +543,11 @@ int ...@@ -538,11 +543,11 @@ int
NIImportOOAffixes(IspellDict * Conf, const char *filename) NIImportOOAffixes(IspellDict * Conf, const char *filename)
{ {
char str[BUFSIZ]; char str[BUFSIZ];
char type[BUFSIZ]; char type[BUFSIZ], *ptype = NULL;
char sflag[BUFSIZ]; char sflag[BUFSIZ];
char mask[BUFSIZ]; char mask[BUFSIZ], *pmask;
char find[BUFSIZ]; char find[BUFSIZ], *pfind;
char repl[BUFSIZ]; char repl[BUFSIZ], *prepl;
bool isSuffix = false; bool isSuffix = false;
int flag = 0; int flag = 0;
char flagflags = 0; char flagflags = 0;
...@@ -577,8 +582,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) ...@@ -577,8 +582,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
scanread = sscanf(str, scanbuf, type, sflag, find, repl, mask); scanread = sscanf(str, scanbuf, type, sflag, find, repl, mask);
lowerstr(type); if (ptype)
if (scanread < 4 || (STRNCMP(type, "sfx") && STRNCMP(type, "pfx"))) pfree(ptype);
ptype = lowerstr(type);
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
continue; continue;
if (scanread == 4) if (scanread == 4)
...@@ -586,29 +593,35 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) ...@@ -586,29 +593,35 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename)
if (strlen(sflag) != 1) if (strlen(sflag) != 1)
continue; continue;
flag = *sflag; flag = *sflag;
isSuffix = (STRNCMP(type, "sfx") == 0) ? true : false; isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
lowerstr(find); pfind = lowerstr(find);
if (t_iseq(find, 'y')) if (t_iseq(find, 'y'))
flagflags |= FF_CROSSPRODUCT; flagflags |= FF_CROSSPRODUCT;
else else
flagflags = 0; flagflags = 0;
pfree(pfind);
} }
else else
{ {
if (strlen(sflag) != 1 || flag != *sflag || flag == 0) if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
continue; continue;
lowerstr(repl); prepl = lowerstr(repl);
lowerstr(find); pfind = lowerstr(find);
lowerstr(mask); pmask = lowerstr(mask);
if (t_iseq(find, '0')) if (t_iseq(find, '0'))
*find = '\0'; *find = '\0';
if (t_iseq(repl, '0')) if (t_iseq(repl, '0'))
*repl = '\0'; *repl = '\0';
NIAddAffix(Conf, flag, flagflags, mask, find, repl, isSuffix ? FF_SUFFIX : FF_PREFIX); NIAddAffix(Conf, flag, flagflags, mask, find, repl, isSuffix ? FF_SUFFIX : FF_PREFIX);
pfree(prepl);
pfree(pfind);
pfree(pmask);
} }
} }
if (ptype)
pfree(ptype);
fclose(affix); fclose(affix);
return 0; return 0;
...@@ -1053,7 +1066,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) ...@@ -1053,7 +1066,6 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
if (wrdlen > MAXNORMLEN) if (wrdlen > MAXNORMLEN)
return NULL; return NULL;
lowerstr(word);
cur = forms = (char **) palloc(MAX_NORM * sizeof(char *)); cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
*cur = NULL; *cur = NULL;
...@@ -1354,13 +1366,17 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, ...@@ -1354,13 +1366,17 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
} }
TSLexeme * TSLexeme *
NINormalizeWord(IspellDict * Conf, char *word) NINormalizeWord(IspellDict * Conf, char *uword)
{ {
char **res = NormalizeSubWord(Conf, word, 0); char **res;
char *word;
TSLexeme *lcur = NULL, TSLexeme *lcur = NULL,
*lres = NULL; *lres = NULL;
uint16 NVariant = 1; uint16 NVariant = 1;
word = lowerstr(uword);
res = NormalizeSubWord(Conf, word, 0);
if (res) if (res)
{ {
char **ptr = res; char **ptr = res;
...@@ -1431,6 +1447,9 @@ NINormalizeWord(IspellDict * Conf, char *word) ...@@ -1431,6 +1447,9 @@ NINormalizeWord(IspellDict * Conf, char *word)
var = ptr; var = ptr;
} }
} }
pfree(word);
return lres; return lres;
} }
......
...@@ -36,7 +36,7 @@ readstoplist(text *in, StopList * s) ...@@ -36,7 +36,7 @@ readstoplist(text *in, StopList * s)
{ {
char *filename = to_absfilename(text2char(in)); char *filename = to_absfilename(text2char(in));
FILE *hin; FILE *hin;
char buf[STOPBUFLEN]; char buf[STOPBUFLEN], *pbuf;
int reallen = 0; int reallen = 0;
if ((hin = fopen(filename, "r")) == NULL) if ((hin = fopen(filename, "r")) == NULL)
...@@ -49,7 +49,6 @@ readstoplist(text *in, StopList * s) ...@@ -49,7 +49,6 @@ readstoplist(text *in, StopList * s)
{ {
buf[strlen(buf) - 1] = '\0'; buf[strlen(buf) - 1] = '\0';
pg_verifymbstr(buf, strlen(buf), false); pg_verifymbstr(buf, strlen(buf), false);
lowerstr(buf);
if (*buf == '\0') if (*buf == '\0')
continue; continue;
...@@ -70,7 +69,14 @@ readstoplist(text *in, StopList * s) ...@@ -70,7 +69,14 @@ readstoplist(text *in, StopList * s)
stop = tmp; stop = tmp;
} }
stop[s->len] = strdup(buf); if (s->wordop)
{
pbuf = s->wordop(buf);
stop[s->len] = strdup(pbuf);
pfree(pbuf);
} else
stop[s->len] = strdup(buf);
if (!stop[s->len]) if (!stop[s->len])
{ {
freestoplist(s); freestoplist(s);
...@@ -79,8 +85,6 @@ readstoplist(text *in, StopList * s) ...@@ -79,8 +85,6 @@ readstoplist(text *in, StopList * s)
(errcode(ERRCODE_OUT_OF_MEMORY), (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory"))); errmsg("out of memory")));
} }
if (s->wordop)
stop[s->len] = (s->wordop) (stop[s->len]);
(s->len)++; (s->len)++;
} }
...@@ -106,7 +110,5 @@ sortstoplist(StopList * s) ...@@ -106,7 +110,5 @@ sortstoplist(StopList * s)
bool bool
searchstoplist(StopList * s, char *key) searchstoplist(StopList * s, char *key)
{ {
if (s->wordop)
key = (*(s->wordop)) (key);
return (s->stop && s->len > 0 && bsearch(&key, s->stop, s->len, sizeof(char *), comparestr)) ? true : false; return (s->stop && s->len > 0 && bsearch(&key, s->stop, s->len, sizeof(char *), comparestr)) ? true : false;
} }
...@@ -14,21 +14,12 @@ wchar2char(char *to, const wchar_t *from, size_t len) ...@@ -14,21 +14,12 @@ wchar2char(char *to, const wchar_t *from, size_t len)
{ {
if (GetDatabaseEncoding() == PG_UTF8) if (GetDatabaseEncoding() == PG_UTF8)
{ {
int r, int r;
nbytes;
if (len == 0) if (len == 0)
return 0; return 0;
/* in any case, *to should be allocated with enough space */ r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
nbytes = WideCharToMultiByte(CP_UTF8, 0, from, len, NULL, 0, NULL, NULL);
if (nbytes == 0)
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("UTF-16 to UTF-8 translation failed: %lu",
GetLastError())));
r = WideCharToMultiByte(CP_UTF8, 0, from, len, to, nbytes,
NULL, NULL); NULL, NULL);
if (r == 0) if (r == 0)
...@@ -36,6 +27,8 @@ wchar2char(char *to, const wchar_t *from, size_t len) ...@@ -36,6 +27,8 @@ wchar2char(char *to, const wchar_t *from, size_t len)
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("UTF-16 to UTF-8 translation failed: %lu", errmsg("UTF-16 to UTF-8 translation failed: %lu",
GetLastError()))); GetLastError())));
Assert(r <= len);
return r; return r;
} }
...@@ -56,7 +49,7 @@ char2wchar(wchar_t *to, const char *from, size_t len) ...@@ -56,7 +49,7 @@ char2wchar(wchar_t *to, const char *from, size_t len)
if (!r) if (!r)
{ {
pg_verifymbstr(from, len, false); pg_verifymbstr(from, strlen(from), false);
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte character for locale"), errmsg("invalid multibyte character for locale"),
...@@ -97,6 +90,11 @@ char * ...@@ -97,6 +90,11 @@ char *
lowerstr(char *str) lowerstr(char *str)
{ {
char *ptr = str; char *ptr = str;
char *out;
int len = strlen(str);
if ( len == 0 )
return pstrdup("");
#ifdef TS_USE_WIDE #ifdef TS_USE_WIDE
...@@ -110,24 +108,67 @@ lowerstr(char *str) ...@@ -110,24 +108,67 @@ lowerstr(char *str)
{ {
wchar_t *wstr, wchar_t *wstr,
*wptr; *wptr;
int len = strlen(str); int wlen;
/*
* alloc number of wchar_t for worst case: len contains the
* number of bytes, which is >= the number of characters; also
* alloc 1 wchar_t for 0, because wchar2char (really wcstombs)
* wants a zero-terminated string
*/
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len+1));
/*
* str SHOULD be a cstring, so wlen contains the number
* of converted characters
*/
wlen = char2wchar(wstr, str, len);
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("transalation failed from server encoding to wchar_t")));
Assert(wlen<=len);
wstr[wlen] = 0;
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
char2wchar(wstr, str, len + 1);
while (*wptr) while (*wptr)
{ {
*wptr = towlower((wint_t) *wptr); *wptr = towlower((wint_t) *wptr);
wptr++; wptr++;
} }
wchar2char(str, wstr, len);
/*
* Alloc result string for worst case + '\0'
*/
len = sizeof(char)*pg_database_encoding_max_length()*(wlen+1);
out = (char*)palloc(len);
/*
* wlen now is number of bytes which is always >= number of characters
*/
wlen = wchar2char(out, wstr, len);
pfree(wstr); pfree(wstr);
if ( wlen < 0 )
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("transalation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len);
out[wlen]='\0';
} }
else else
#endif #endif
{
char *outptr;
outptr = out = (char*)palloc( sizeof(char) * (len+1) );
while (*ptr) while (*ptr)
{ {
*ptr = tolower(*(unsigned char *) ptr); *outptr++ = tolower(*(unsigned char *) ptr);
ptr++; ptr++;
} }
return str; *outptr = '\0';
}
return out;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment