Commit f2a01b0d authored by Teodor Sigaev

Fix localization support for multibyte encoding and C locale.

Slightly reworked patch from Tatsuo Ishii
parent 7021d6f6
...@@ -12,13 +12,13 @@ ...@@ -12,13 +12,13 @@
size_t size_t
wchar2char(char *to, const wchar_t *from, size_t len) wchar2char(char *to, const wchar_t *from, size_t len)
{ {
if (len == 0)
return 0;
if (GetDatabaseEncoding() == PG_UTF8) if (GetDatabaseEncoding() == PG_UTF8)
{ {
int r; int r;
if (len == 0)
return 0;
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len, r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
NULL, NULL); NULL, NULL);
...@@ -34,17 +34,19 @@ wchar2char(char *to, const wchar_t *from, size_t len) ...@@ -34,17 +34,19 @@ wchar2char(char *to, const wchar_t *from, size_t len)
return wcstombs(to, from, len); return wcstombs(to, from, len);
} }
#endif /* WIN32 */
size_t size_t
char2wchar(wchar_t *to, const char *from, size_t len) char2wchar(wchar_t *to, const char *from, size_t len)
{ {
if (len == 0)
return 0;
#ifdef WIN32
if (GetDatabaseEncoding() == PG_UTF8) if (GetDatabaseEncoding() == PG_UTF8)
{ {
int r; int r;
if (len == 0)
return 0;
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r) if (!r)
...@@ -60,29 +62,44 @@ char2wchar(wchar_t *to, const char *from, size_t len) ...@@ -60,29 +62,44 @@ char2wchar(wchar_t *to, const char *from, size_t len)
return r; return r;
} }
else
#endif /* WIN32 */
if ( lc_ctype_is_c() )
{
/*
* pg_mb2wchar_with_len always adds trailing '\0', so
* 'to' should be allocated with sufficient space
*/
return pg_mb2wchar_with_len(from, (pg_wchar *)to, len);
}
return mbstowcs(to, from, len); return mbstowcs(to, from, len);
} }
#endif /* WIN32 */
int int
_t_isalpha(const char *ptr) _t_isalpha(const char *ptr)
{ {
wchar_t character; wchar_t character[2];
if (lc_ctype_is_c())
return isalpha(TOUCHAR(ptr));
char2wchar(&character, ptr, 1); char2wchar(character, ptr, 1);
return iswalpha((wint_t) character); return iswalpha((wint_t) *character);
} }
int int
_t_isprint(const char *ptr) _t_isprint(const char *ptr)
{ {
wchar_t character; wchar_t character[2];
if (lc_ctype_is_c())
return isprint(TOUCHAR(ptr));
char2wchar(&character, ptr, 1); char2wchar(character, ptr, 1);
return iswprint((wint_t) character); return iswprint((wint_t) *character);
} }
#endif /* TS_USE_WIDE */ #endif /* TS_USE_WIDE */
...@@ -126,7 +143,7 @@ lowerstr(char *str) ...@@ -126,7 +143,7 @@ lowerstr(char *str)
if ( wlen < 0 ) if ( wlen < 0 )
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("transalation failed from server encoding to wchar_t"))); errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen<=len); Assert(wlen<=len);
wstr[wlen] = 0; wstr[wlen] = 0;
...@@ -152,7 +169,7 @@ lowerstr(char *str) ...@@ -152,7 +169,7 @@ lowerstr(char *str)
if ( wlen < 0 ) if ( wlen < 0 )
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("transalation failed from wchar_t to server encoding %d", errno))); errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen<=len); Assert(wlen<=len);
out[wlen]='\0'; out[wlen]='\0';
} }
......
...@@ -30,16 +30,17 @@ ...@@ -30,16 +30,17 @@
#define TOUCHAR(x) (*((unsigned char*)(x))) #define TOUCHAR(x) (*((unsigned char*)(x)))
#ifdef TS_USE_WIDE #ifdef TS_USE_WIDE
size_t char2wchar(wchar_t *to, const char *from, size_t len);
#ifdef WIN32 #ifdef WIN32
size_t wchar2char(char *to, const wchar_t *from, size_t len); size_t wchar2char(char *to, const wchar_t *from, size_t len);
size_t char2wchar(wchar_t *to, const char *from, size_t len);
#else /* WIN32 */ #else /* WIN32 */
/* correct mbstowcs */ /* correct wcstombs */
#define char2wchar mbstowcs
#define wchar2char wcstombs #define wchar2char wcstombs
#endif /* WIN32 */ #endif /* WIN32 */
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) ) #define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
...@@ -55,10 +56,10 @@ extern int _t_isprint(const char *ptr); ...@@ -55,10 +56,10 @@ extern int _t_isprint(const char *ptr);
*/ */
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) #define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
#define COPYCHAR(d,s) do { \ #define COPYCHAR(d,s) do { \
int lll = pg_mblen( s ); \ int lll = pg_mblen( s ); \
\ \
while( lll-- ) \ while( lll-- ) \
TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \ TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0) } while(0)
......
/* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.11 2006/10/04 00:29:47 momjian Exp $ */ /* $PostgreSQL: pgsql/contrib/tsearch2/wordparser/parser.c,v 1.12 2007/01/15 15:16:28 teodor Exp $ */
#include "postgres.h" #include "postgres.h"
...@@ -40,16 +40,13 @@ TParserInit(char *str, int len) ...@@ -40,16 +40,13 @@ TParserInit(char *str, int len)
#ifdef TS_USE_WIDE #ifdef TS_USE_WIDE
/* /*
* Use wide char code only when max encoding length > 1 and ctype != C. * Use wide char code only when max encoding length > 1.
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
*/ */
if (prs->charmaxlen > 1 && !lc_ctype_is_c()) if (prs->charmaxlen > 1)
{ {
prs->usewide = true; prs->usewide = true;
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * prs->lenstr); prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr+1));
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr); prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
} }
else else
...@@ -83,25 +80,99 @@ TParserClose(TParser * prs) ...@@ -83,25 +80,99 @@ TParserClose(TParser * prs)
/* /*
* defining support function, equvalent is* macroses, but * defining support function, equvalent is* macroses, but
* working with any possible encodings and locales * working with any possible encodings and locales. Note,
* that with multibyte encoding and C-locale isw* function may fail
* or give wrong result. Note 2: multibyte encoding and C-locale
* often are used for Asian languages.
*/ */
#ifdef TS_USE_WIDE #ifdef TS_USE_WIDE
#define p_iswhat(type) \ #define p_iswhat(type) \
static int \ static int \
p_is##type(TParser *prs) { \ p_is##type(TParser *prs) { \
Assert( prs->state ); \ Assert( prs->state ); \
return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \ if ( prs->usewide ) \
is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \ { \
} \ if ( lc_ctype_is_c() ) \
\ return is##type( 0xff & *( prs->wstr + prs->state->poschar) ); \
static int \ \
p_isnot##type(TParser *prs) { \ return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) ); \
return !p_is##type(prs); \ } \
\
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
} \
\
static int \
p_isnot##type(TParser *prs) { \
return !p_is##type(prs); \
} }
/*
 * p_isalnum
 *		Report whether the character at the parser's current position is
 *		alphanumeric.
 *
 * Three cases are distinguished:
 *	- single-byte mode (prs->usewide is false): the current byte of
 *	  prs->str is classified with isalnum();
 *	- wide mode outside the C locale: the current element of prs->wstr is
 *	  classified with iswalnum();
 *	- wide mode under the C locale: any code above 0x7f counts as
 *	  alphanumeric, and the rest go through isalnum().
 *
 * Returns non-zero when the character is considered alphanumeric.
 */
static int
p_isalnum(TParser *prs)
{
	unsigned int code;

	Assert( prs->state );

	/* single-byte mode: classify the byte at the current byte offset */
	if (!prs->usewide)
		return isalnum( *(unsigned char*)( prs->str + prs->state->posbyte ));

	/* wide mode with a real locale: let the wide classifier decide */
	if (!lc_ctype_is_c())
		return iswalnum( (wint_t)*( prs->wstr + prs->state->poschar));

	/*
	 * NOTE(review): this element is read through an unsigned int pointer,
	 * whereas p_isalpha reads the same element as a plain wchar_t; confirm
	 * which access matches how wstr is filled.
	 */
	code = *(unsigned int*)(prs->wstr + prs->state->poschar);

	/*
	 * any non-ascii symbol with multibyte encoding
	 * with C-locale is an alpha character
	 */
	if ( code > 0x7f )
		return 1;

	return isalnum(0xff & code);
}
/*
 * p_isnotalnum
 *		Logical negation of p_isalnum(): returns 1 when p_isalnum() returns
 *		zero for the parser's current character, and 0 otherwise.
 */
static int
p_isnotalnum(TParser *prs)
{
	return p_isalnum(prs) ? 0 : 1;
}
/*
 * p_isalpha
 *		Report whether the character at the parser's current position is
 *		alphabetic.
 *
 * Three cases are distinguished:
 *	- single-byte mode (prs->usewide is false): the current byte of
 *	  prs->str is classified with isalpha();
 *	- wide mode outside the C locale: the current element of prs->wstr is
 *	  classified with iswalpha();
 *	- wide mode under the C locale: any code above 0x7f counts as
 *	  alphabetic, and the rest go through isalpha().
 *
 * Returns non-zero when the character is considered alphabetic.
 */
static int
p_isalpha(TParser *prs)
{
	unsigned int code;

	Assert( prs->state );

	/* single-byte mode: classify the byte at the current byte offset */
	if (!prs->usewide)
		return isalpha( *(unsigned char*)( prs->str + prs->state->posbyte ));

	/* wide mode with a real locale: let the wide classifier decide */
	if (!lc_ctype_is_c())
		return iswalpha( (wint_t)*( prs->wstr + prs->state->poschar));

	/* C locale: take the current wide element as an unsigned code */
	code = *(prs->wstr + prs->state->poschar);

	/*
	 * any non-ascii symbol with multibyte encoding
	 * with C-locale is an alpha character
	 */
	if ( code > 0x7f )
		return 1;

	return isalpha(0xff & code);
}
/*
 * p_isnotalpha
 *		Logical negation of p_isalpha(): returns 1 when p_isalpha() returns
 *		zero for the parser's current character, and 0 otherwise.
 */
static int
p_isnotalpha(TParser *prs)
{
	return p_isalpha(prs) ? 0 : 1;
}
/* p_iseq should be used only for ascii symbols */ /* p_iseq should be used only for ascii symbols */
...@@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c) ...@@ -111,18 +182,19 @@ p_iseq(TParser * prs, char c)
Assert(prs->state); Assert(prs->state);
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0; return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
} }
#else /* TS_USE_WIDE */ #else /* TS_USE_WIDE */
#define p_iswhat(type) \ #define p_iswhat(type) \
static int \ static int \
p_is##type(TParser *prs) { \ p_is##type(TParser *prs) { \
Assert( prs->state ); \ Assert( prs->state ); \
return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \ return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
} \ } \
\ \
static int \ static int \
p_isnot##type(TParser *prs) { \ p_isnot##type(TParser *prs) { \
return !p_is##type(prs); \ return !p_is##type(prs); \
} }
...@@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c) ...@@ -132,10 +204,12 @@ p_iseq(TParser * prs, char c)
Assert(prs->state); Assert(prs->state);
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0; return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
} }
#endif /* TS_USE_WIDE */
p_iswhat(alnum) p_iswhat(alnum)
p_iswhat(alpha) p_iswhat(alpha)
#endif /* TS_USE_WIDE */
p_iswhat(digit) p_iswhat(digit)
p_iswhat(lower) p_iswhat(lower)
p_iswhat(print) p_iswhat(print)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment