Commit 08e0b34b authored by Bruce Momjian

Back out fix for Unicode characters above 0x10000

parent 5d7a555d
<!-- <!--
$PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.65 2004/11/12 21:50:53 tgl Exp $ $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.66 2004/12/03 01:20:14 momjian Exp $
--> -->
<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.2//EN" [ <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.2//EN" [
...@@ -179,6 +179,7 @@ $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.65 2004/11/12 21:50:53 tgl Exp ...@@ -179,6 +179,7 @@ $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.65 2004/11/12 21:50:53 tgl Exp
&lobj; &lobj;
&ecpg; &ecpg;
&infoschema; &infoschema;
&external_projects;
</part> </part>
......
/* /*
* conversion functions between pg_wchar and multibyte streams. * conversion functions between pg_wchar and multibyte streams.
* Tatsuo Ishii * Tatsuo Ishii
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.39 2004/12/02 22:37:13 momjian Exp $ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.40 2004/12/03 01:20:20 momjian Exp $
* *
* WIN1250 client encoding updated by Pavel Behal * WIN1250 client encoding updated by Pavel Behal
* *
...@@ -343,31 +343,6 @@ pg_johab_dsplen(const unsigned char *s) ...@@ -343,31 +343,6 @@ pg_johab_dsplen(const unsigned char *s)
return (pg_euc_dsplen(s)); return (pg_euc_dsplen(s));
} }
/*
 * isLegalUTF8
 *
 * Report whether the "len" bytes starting at "source" form a single
 * well-formed UTF-8 sequence.
 *
 * Returns false when "source" is a null pointer, when "len" differs from
 * the length pg_utf_mblen() derives from the lead byte, when "len" is
 * outside 1..6, or when any byte is out of range for its position;
 * returns true otherwise.
 *
 * Every trailing byte must lie in 0x80..0xBF.  The second byte is held to
 * a narrower range for a few lead bytes:
 *	0xE0: 0xA0..0xBF	(excludes overlong three-byte forms)
 *	0xED: 0x80..0x9F	(excludes UTF-16 surrogates U+D800..U+DFFF)
 *	0xF0: 0x90..0xBF	(excludes overlong four-byte forms)
 *	0xF4: 0x80..0x8F	(excludes code points above U+10FFFF)
 *
 * Lead bytes 0x80..0xC1 and anything above 0xFD are always rejected.
 * Overlong five- and six-byte forms are not detected here.
 */
bool
isLegalUTF8(const UTF8 *source, int len)
{
	UTF8		a;

	/* Check the pointer before reading through it or indexing from it. */
	if (!source || pg_utf_mblen(source) != len)
		return false;

	switch (len)
	{
		default:
			return false;

			/* Trailing bytes are checked last-to-first, falling through. */
		case 6:
			a = source[5];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 5:
			a = source[4];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 4:
			a = source[3];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 3:
			a = source[2];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALLTHROUGH */
		case 2:
			a = source[1];

			/*
			 * Each branch tests both bounds of the second byte, so no lead
			 * byte can slip a non-continuation byte (< 0x80) through.
			 */
			switch (*source)
			{
				case 0xE0:
					if (a < 0xA0 || a > 0xBF)
						return false;
					break;
				case 0xED:
					if (a < 0x80 || a > 0x9F)
						return false;
					break;
				case 0xF0:
					if (a < 0x90 || a > 0xBF)
						return false;
					break;
				case 0xF4:
					if (a < 0x80 || a > 0x8F)
						return false;
					break;
				default:
					if (a < 0x80 || a > 0xBF)
						return false;
					break;
			}
			/* FALLTHROUGH */
		case 1:
			/* 0x80..0xBF are continuation bytes; 0xC0/0xC1 are overlong. */
			if (*source >= 0x80 && *source < 0xC2)
				return false;
			if (*source > 0xFD)
				return false;
			break;
	}

	return true;
}
/* /*
* convert UTF-8 string to pg_wchar (UCS-2) * convert UTF-8 string to pg_wchar (UCS-2)
* caller should allocate enough space for "to" * caller should allocate enough space for "to"
...@@ -423,7 +398,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) ...@@ -423,7 +398,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
* returns the byte length of a UTF-8 word pointed to by s * returns the byte length of a UTF-8 word pointed to by s
*/ */
int int
pg_utf_mblen(const UTF8 *s) pg_utf_mblen(const unsigned char *s)
{ {
int len = 1; int len = 1;
...@@ -431,19 +406,13 @@ pg_utf_mblen(const UTF8 *s) ...@@ -431,19 +406,13 @@ pg_utf_mblen(const UTF8 *s)
len = 1; len = 1;
else if ((*s & 0xe0) == 0xc0) else if ((*s & 0xe0) == 0xc0)
len = 2; len = 2;
else if ((*s & 0xf0) == 0xe0) else if ((*s & 0xe0) == 0xe0)
len = 3; len = 3;
else if ((*s & 0xf8) == 0xf0)
len = 4;
else if ((*s & 0xfc) == 0xf8)
len = 5;
else if ((*s & 0xfe) == 0xfc)
len = 6;
return (len); return (len);
} }
static int static int
pg_utf_dsplen(const UTF8 *s) pg_utf_dsplen(const unsigned char *s)
{ {
return 1; /* XXX fix me! */ return 1; /* XXX fix me! */
} }
...@@ -752,8 +721,8 @@ pg_wchar_tbl pg_wchar_table[] = { ...@@ -752,8 +721,8 @@ pg_wchar_tbl pg_wchar_table[] = {
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 6}, /* 6; PG_UNICODE */ {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UNICODE */
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */ {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 10; PG_LATIN3 */
...@@ -775,11 +744,11 @@ pg_wchar_tbl pg_wchar_table[] = { ...@@ -775,11 +744,11 @@ pg_wchar_tbl pg_wchar_table[] = {
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 26; ISO-8859-7 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 27; ISO-8859-8 */
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */ {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 28; PG_WIN1250 */
{0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */ {0, pg_sjis_mblen, pg_sjis_dsplen, 2}, /* 29; PG_SJIS */
{0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */ {0, pg_big5_mblen, pg_big5_dsplen, 2}, /* 30; PG_BIG5 */
{0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */ {0, pg_gbk_mblen, pg_gbk_dsplen, 2}, /* 31; PG_GBK */
{0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */ {0, pg_uhc_mblen, pg_uhc_dsplen, 2}, /* 32; PG_UHC */
{0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */ {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
}; };
/* returns the byte length of a word for mule internal code */ /* returns the byte length of a word for mule internal code */
...@@ -853,48 +822,51 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError) ...@@ -853,48 +822,51 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
while (len > 0 && *mbstr) while (len > 0 && *mbstr)
{ {
/* special UTF-8 check */
if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
{
if (noError)
return false;
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
}
l = pg_mblen(mbstr); l = pg_mblen(mbstr);
/* special UTF-8 check */ for (i = 1; i < l; i++)
if (encoding == PG_UTF8) { {
if(!isLegalUTF8(mbstr,l)) { /*
if (noError) return false; * we expect that every multibyte char consists of bytes
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near character %c",*mbstr))); * having the 8th bit set
} */
} else { if (i >= len || (mbstr[i] & 0x80) == 0)
for (i = 1; i < l; i++)
{ {
/* char buf[8 * 2 + 1];
* we expect that every multibyte char consists of bytes char *p = buf;
* having the 8th bit set int j,
*/
if (i >= len || (mbstr[i] & 0x80) == 0)
{
char buf[8 * 2 + 1];
char *p = buf;
int j,
jlimit; jlimit;
if (noError) if (noError)
return false; return false;
jlimit = Min(l, len); jlimit = Min(l, len);
jlimit = Min(jlimit, 8); /* prevent buffer overrun */ jlimit = Min(jlimit, 8); /* prevent buffer overrun */
for (j = 0; j < jlimit; j++) for (j = 0; j < jlimit; j++)
p += sprintf(p, "%02x", mbstr[j]); p += sprintf(p, "%02x", mbstr[j]);
ereport(ERROR, ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid byte sequence for encoding \"%s\": 0x%s", errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
GetDatabaseEncodingName(), buf))); GetDatabaseEncodingName(), buf)));
}
} }
} }
len -= l; len -= l;
mbstr += l; mbstr += l;
} }
return true; return true;
} }
......
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.54 2004/12/02 22:37:14 momjian Exp $ */ /* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.55 2004/12/03 01:20:33 momjian Exp $ */
#ifndef PG_WCHAR_H #ifndef PG_WCHAR_H
#define PG_WCHAR_H #define PG_WCHAR_H
...@@ -17,14 +17,6 @@ ...@@ -17,14 +17,6 @@
*/ */
typedef unsigned int pg_wchar; typedef unsigned int pg_wchar;
/*
* The UTF types
*/
typedef unsigned int UTF32; /* at least 32 bits */
typedef unsigned short UTF16; /* at least 16 bits */
typedef unsigned char UTF8; /* typically 8 bits */
/* /*
* various definitions for EUC * various definitions for EUC
*/ */
...@@ -348,6 +340,4 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc); ...@@ -348,6 +340,4 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
extern bool isLegalUTF8(const UTF8 *source, int len);
#endif /* PG_WCHAR_H */ #endif /* PG_WCHAR_H */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment