Commit de53ce81 authored by Tatsuo Ishii's avatar Tatsuo Ishii

Support for conversion between UNICODE and other encodings

Currently ISO8859-[1-5] and EUC_JP are supported.
Support for other encodings will be coming soon.
parent 6619ad11
This diff is collapsed.
......@@ -4,7 +4,7 @@
# Makefile for utils/mb
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/utils/mb/Makefile,v 1.10 2000/08/31 16:10:56 petere Exp $
# $Header: /cvsroot/pgsql/src/backend/utils/mb/Makefile,v 1.11 2000/10/12 06:06:49 ishii Exp $
#
#-------------------------------------------------------------------------
......@@ -29,6 +29,15 @@ sjistest.o: sjistest.c
liketest.o: liketest.c
$(CC) -c $(CFLAGS) liketest.c
uconv.o: uconv.c
$(CC) -c $(CFLAGS) uconv.c
uconv2.o: uconv2.c
$(CC) -c $(CFLAGS) uconv2.c
utftest.o: utftest.c conv.c wchar.c mbutils.c
$(CC) -c $(CFLAGS) utftest.c
sjistest: $(OBJS) sjistest.o palloc.o
$(CC) -o sjistest sjistest.o palloc.o \
common.o mbutils.o wchar.o wstrcmp.o wstrncmp.o variable.o \
......@@ -39,6 +48,21 @@ liketest: $(OBJS) liketest.o palloc.o
common.o mbutils.o wchar.o wstrcmp.o wstrncmp.o variable.o \
big5.o $(LDFLAGS)
utftest: $(OBJS) utftest.o palloc.o
$(CC) -o utftest utftest.o palloc.o \
common.o wstrcmp.o wstrncmp.o variable.o \
big5.o $(LDFLAGS)
uconv: uconv.o palloc.o
$(CC) -o uconv uconv.o palloc.o \
common.o conv.o wchar.o \
big5.o mbutils.o $(LDFLAGS)
uconv2: uconv2.o palloc.o
$(CC) -o uconv2 uconv2.o palloc.o \
common.o conv.o wchar.o \
big5.o mbutils.o $(LDFLAGS)
depend dep:
$(CC) -MM $(CFLAGS) *.c >depend
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#include <stdio.h>
#include <string.h>
#include "mb/pg_wchar.h"
/*
 * Result codes returned by MatchText() and MatchTextLower():
 *   LIKE_FALSE - the pattern does not match at this starting position
 *   LIKE_TRUE  - the pattern matches
 *   LIKE_ABORT - the text ran out before the pattern was satisfied, so
 *                trying later starting positions is pointless
 */
#define LIKE_FALSE 0
#define LIKE_TRUE 1
#define LIKE_ABORT 2
/* Element type of the text and pattern buffers handed to the matchers. */
#define PG_CHAR unsigned char
/* Largest value an 8-bit unsigned char can hold. */
#define UCHARMAX 0xff
/*----------------------------------------------------------------*/
/*
 * Compare the (possibly multi-byte) characters at p1 and p2 for exact,
 * byte-wise equality.
 *
 * The byte length of each character is obtained from pg_mblen(); characters
 * of different lengths are never equal.
 *
 * Returns 1 when the characters are identical, 0 otherwise.
 */
static int wchareq(unsigned char *p1, unsigned char *p2)
{
    int nbytes = pg_mblen(p1);
    int i;

    if (nbytes != pg_mblen(p2))
        return 0;

    for (i = 0; i < nbytes; i++)
    {
        if (p1[i] != p2[i])
            return 0;
    }
    return 1;
}
static int iwchareq(unsigned char *p1, unsigned char *p2)
{
int c1, c2;
int l;
/* short cut. if *p1 and *p2 is lower than UCHARMAX, then
we assume they are ASCII */
if (*p1 < UCHARMAX && *p2 < UCHARMAX)
return(tolower(*p1) == tolower(*p2));
if (*p1 < UCHARMAX)
c1 = tolower(*p1);
else
{
l = pg_mblen(p1);
(void)pg_mb2wchar_with_len(p1, (pg_wchar *)&c1, l);
c1 = tolower(c1);
}
if (*p2 < UCHARMAX)
c2 = tolower(*p2);
else
{
l = pg_mblen(p2);
(void)pg_mb2wchar_with_len(p2, (pg_wchar *)&c2, l);
c2 = tolower(c2);
}
return(c1 == c2);
}
/*
 * Character-level primitives used by the matchers.
 *
 *   CHAREQ(p1, p2)    - exact equality of the characters at p1 and p2
 *   ICHAREQ(p1, p2)   - case-insensitive equality
 *   NextChar(p, plen) - advance p by one character and shrink plen by the
 *                       number of bytes consumed
 *
 * With MULTIBYTE defined a character may span several bytes, so the work is
 * delegated to wchareq()/iwchareq()/pg_mblen().  NextChar is wrapped in
 * do { } while (0) so that it behaves as a single statement (safe in an
 * unbraced if/else), and its temporary avoids the reserved "__" prefix.
 * NextChar evaluates p more than once in the multi-byte form; pass plain
 * variables only.
 */
#ifdef MULTIBYTE
#define CHAREQ(p1, p2) wchareq(p1, p2)
#define ICHAREQ(p1, p2) iwchareq(p1, p2)
#define NextChar(p, plen) \
    do { int nc_len_ = pg_mblen(p); (p) += nc_len_; (plen) -= nc_len_; } while (0)
#else
#define CHAREQ(p1, p2) (*(p1) == *(p2))
#define ICHAREQ(p1, p2) (tolower(*(p1)) == tolower(*(p2)))
#define NextChar(p, plen) ((p)++, (plen)--)
#endif
/*
 * Case-sensitive LIKE matcher.
 *
 *   t, tlen - text to examine and its remaining length in bytes
 *   p, plen - pattern and its remaining length in bytes
 *   e       - escape character, or NULL when no escape is in effect
 *
 * In the pattern '%' matches any run of characters (including none) and
 * '_' matches exactly one character; a character preceded by the escape
 * character must match literally.
 *
 * Returns LIKE_TRUE on a match, LIKE_FALSE when the pattern fails at this
 * position, and LIKE_ABORT when the text was exhausted first (so no later
 * starting position can succeed either).
 */
static int
MatchText(PG_CHAR * t, int tlen, PG_CHAR * p, int plen, char *e)
{
    const int   have_escape = (e != NULL);

    /*
     * Fast path: a pattern consisting of a single '%' matches everything,
     * unless '%' itself was chosen as the escape character.
     */
    if (plen == 1 && *p == '%' && !(have_escape && *e == '%'))
        return LIKE_TRUE;

    while (tlen > 0 && plen > 0)
    {
        if (have_escape && CHAREQ(p,e))
        {
            /* Escaped character: what follows must match literally. */
            NextChar(p, plen);
            if (plen <= 0)
                return LIKE_FALSE;
            if (!CHAREQ(t,p))
                return LIKE_FALSE;
        }
        else if (*p == '%')
        {
            /* A run of '%' is equivalent to a single one; skip them all. */
            do
            {
                NextChar(p, plen);
            } while (plen > 0 && *p == '%');

            /* A trailing '%' swallows the rest of the text. */
            if (plen <= 0)
                return LIKE_TRUE;

            /*
             * Try every remaining text position as the place where the
             * rest of the pattern starts.  Recursion is attempted only if
             * the first pattern character could match here.
             */
            for (; tlen > 0;)
            {
                if (CHAREQ(t,p) || *p == '_' || (have_escape && CHAREQ(p,e)))
                {
                    int         rc = MatchText(t, tlen, p, plen, e);

                    if (rc != LIKE_FALSE)
                        return rc;      /* LIKE_TRUE or LIKE_ABORT */
                }
                NextChar(t, tlen);
            }

            /* Text exhausted: later starting points cannot match either. */
            return LIKE_ABORT;
        }
        else if (!(*p == '_' || CHAREQ(t,p)))
        {
            /* Neither the one-character wildcard nor a literal match. */
            return LIKE_FALSE;
        }

        NextChar(t, tlen);
        NextChar(p, plen);
    }

    /* Pattern consumed while text remains. */
    if (tlen > 0)
        return LIKE_FALSE;

    /* Text consumed: only trailing '%'s may remain in the pattern. */
    while (plen > 0 && *p == '%')
        NextChar(p, plen);

    return (plen <= 0) ? LIKE_TRUE : LIKE_ABORT;
} /* MatchText() */
/*
 * Case-insensitive LIKE matcher.
 *
 *   t, tlen - text to examine and its remaining length in bytes
 *   p, plen - pattern and its remaining length in bytes
 *   e       - escape character, or NULL when no escape is in effect
 *
 * Same algorithm as MatchText(), except that every character comparison is
 * done with ICHAREQ.  '%' matches any run of characters, '_' matches one
 * character, and a character following the escape character must match
 * (case-insensitively) as a literal.
 *
 * Returns LIKE_TRUE on a match, LIKE_FALSE when the pattern fails at this
 * position, and LIKE_ABORT when the text was exhausted first.
 */
static int
MatchTextLower(PG_CHAR * t, int tlen, PG_CHAR * p, int plen, char *e)
{
    /*
     * Fast path: a pattern consisting of a single '%' matches everything,
     * unless '%' itself was chosen as the escape character.
     */
    if ((plen == 1) && (*p == '%')
        && ! ((e != NULL) && (*e == '%')))
        return LIKE_TRUE;

    while ((tlen > 0) && (plen > 0))
    {
        /*
         * If an escape character was specified and we find it here in the
         * pattern, the next pattern character must match the text.
         */
        if ((e != NULL) && ICHAREQ(p,e))
        {
            NextChar(p, plen);
            if ((plen <= 0) || !ICHAREQ(t,p))
                return LIKE_FALSE;
        }
        else if (*p == '%')
        {
            /* %% is the same as % according to the SQL standard */
            /* Advance past all %'s */
            while ((plen > 0) && (*p == '%'))
                NextChar(p, plen);

            /* Trailing percent matches everything. */
            if (plen <= 0)
                return LIKE_TRUE;

            /*
             * Otherwise, scan for a text position at which we can match
             * the rest of the pattern.
             */
            while (tlen > 0)
            {
                /*
                 * Optimization to prevent most recursion: don't recurse
                 * unless the first pattern char might match this text char.
                 */
                if (ICHAREQ(t,p) || (*p == '_')
                    || ((e != NULL) && ICHAREQ(p,e)))
                {
                    /*
                     * Recurse into the case-insensitive matcher.  Calling
                     * MatchText() here would make everything after a '%'
                     * case-sensitive again.
                     */
                    int         matched = MatchTextLower(t, tlen, p, plen, e);

                    if (matched != LIKE_FALSE)
                        return matched; /* TRUE or ABORT */
                }
                NextChar(t, tlen);
            }

            /*
             * End of text with no match, so no point in trying later
             * places to start matching this pattern.
             */
            return LIKE_ABORT;
        }
        else if ((*p != '_') && !ICHAREQ(t,p))
        {
            /* Not the single-character wildcard and no match: give up. */
            return LIKE_FALSE;
        }

        NextChar(t, tlen);
        NextChar(p, plen);
    }

    if (tlen > 0)
        return LIKE_FALSE;      /* end of pattern, but not of text */

    /* End of input string.  Do we have matching pattern remaining? */
    while ((plen > 0) && (*p == '%'))   /* allow multiple %'s at end of pattern */
        NextChar(p, plen);
    if (plen <= 0)
        return LIKE_TRUE;

    /*
     * End of text with no match, so no point in trying later places to
     * start matching this pattern.
     */
    return LIKE_ABORT;
} /* MatchTextLower() */
/*
 * Test driver: match the text "Z01" against the pattern "_Z%" with
 * backslash as the escape character, using the case-insensitive matcher,
 * and print the numeric result code followed by a newline.
 *
 * Declared with an explicit int return type and an explicit return value;
 * the buffers are real unsigned char arrays rather than string literals
 * assigned to unsigned char pointers.
 */
int
main(void)
{
    unsigned char text[] = "Z01";
    unsigned char pattern[] = "_Z%";
    char        escape[] = "\\";
    int         tlen = (int) strlen((const char *) text);
    int         plen = (int) strlen((const char *) pattern);

    printf("%d\n", MatchTextLower(text, tlen, pattern, plen, escape));
    return 0;
}
......@@ -3,7 +3,7 @@
* client encoding and server internal encoding.
* (currently mule internal code (mic) is used)
* Tatsuo Ishii
* $Id: mbutils.c,v 1.11 2000/08/27 10:40:48 ishii Exp $ */
* $Id: mbutils.c,v 1.12 2000/10/12 06:06:50 ishii Exp $ */
#include "postgres.h"
......@@ -21,8 +21,8 @@ static void (*server_from_mic) (); /* MIC to something */
/*
* find encoding table entry by encoding
*/
static pg_encoding_conv_tbl *
get_enc_ent(int encoding)
pg_encoding_conv_tbl *
pg_get_enc_ent(int encoding)
{
pg_encoding_conv_tbl *p = pg_conv_tbl;
......@@ -35,8 +35,8 @@ get_enc_ent(int encoding)
}
/*
* set the client encoding. if client/server encoding is
* not supported, returns -1
* set the client encoding. if encoding conversion between
* client/server encoding is not supported, returns -1
*/
int
pg_set_client_encoding(int encoding)
......@@ -52,8 +52,8 @@ pg_set_client_encoding(int encoding)
}
else if (current_server_encoding == MULE_INTERNAL)
{ /* server == MULE_INETRNAL? */
client_to_mic = get_enc_ent(encoding)->to_mic;
client_from_mic = get_enc_ent(encoding)->from_mic;
client_to_mic = pg_get_enc_ent(encoding)->to_mic;
client_from_mic = pg_get_enc_ent(encoding)->from_mic;
server_to_mic = server_from_mic = 0;
if (client_to_mic == 0 || client_from_mic == 0)
return (-1);
......@@ -61,17 +61,33 @@ pg_set_client_encoding(int encoding)
else if (encoding == MULE_INTERNAL)
{ /* client == MULE_INETRNAL? */
client_to_mic = client_from_mic = 0;
server_to_mic = get_enc_ent(current_server_encoding)->to_mic;
server_from_mic = get_enc_ent(current_server_encoding)->from_mic;
server_to_mic = pg_get_enc_ent(current_server_encoding)->to_mic;
server_from_mic = pg_get_enc_ent(current_server_encoding)->from_mic;
if (server_to_mic == 0 || server_from_mic == 0)
return (-1);
}
else if (current_server_encoding == UNICODE)
{ /* server == UNICODE? */
client_to_mic = pg_get_enc_ent(encoding)->to_unicode;
client_from_mic = pg_get_enc_ent(encoding)->from_unicode;
server_to_mic = server_from_mic = 0;
if (client_to_mic == 0 || client_from_mic == 0)
return (-1);
}
else if (encoding == UNICODE)
{ /* client == UNICODE? */
client_to_mic = client_from_mic = 0;
server_to_mic = pg_get_enc_ent(current_server_encoding)->to_unicode;
server_from_mic = pg_get_enc_ent(current_server_encoding)->from_unicode;
if (server_to_mic == 0 || server_from_mic == 0)
return (-1);
}
else
{
client_to_mic = get_enc_ent(encoding)->to_mic;
client_from_mic = get_enc_ent(encoding)->from_mic;
server_to_mic = get_enc_ent(current_server_encoding)->to_mic;
server_from_mic = get_enc_ent(current_server_encoding)->from_mic;
client_to_mic = pg_get_enc_ent(encoding)->to_mic;
client_from_mic = pg_get_enc_ent(encoding)->from_mic;
server_to_mic = pg_get_enc_ent(current_server_encoding)->to_mic;
server_from_mic = pg_get_enc_ent(current_server_encoding)->from_mic;
if (client_to_mic == 0 || client_from_mic == 0)
return (-1);
if (server_to_mic == 0 || server_from_mic == 0)
......@@ -193,6 +209,13 @@ pg_mblen(const unsigned char *mbstr)
return ((*pg_wchar_table[GetDatabaseEncoding()].mblen) (mbstr));
}
/*
 * Returns the byte length of the multi-byte character at mbstr, using the
 * mblen routine registered in pg_wchar_table for the given encoding rather
 * than the one for the current database encoding.
 */
int
pg_mblen_with_encoding(const unsigned char *mbstr, int encoding)
{
    return pg_wchar_table[encoding].mblen(mbstr);
}
/* returns the length (counted as a wchar) of a multi-byte string */
int
pg_mbstrlen(const unsigned char *mbstr)
......
#include "postgres.h"

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

#include "utils/memutils.h"
/*
 * Minimal replacement for elog() used when linking the stand-alone test
 * programs: formats the message to stdout and returns.
 *
 *   lev - severity level; ignored here
 *   fmt - printf-style format string, followed by its arguments
 *
 * The arguments are forwarded through vprintf().  Calling printf(fmt)
 * alone is undefined behavior as soon as fmt contains a conversion.
 */
void
elog(int lev, const char *fmt,...)
{
    va_list     args;

    (void) lev;

    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);
}
/*
 * Stand-alone replacements for the memory-context allocator, so that the
 * test programs can be linked without the rest of the backend.  They are
 * thin wrappers around the C library allocator.  The previous versions were
 * empty bodies, so the two non-void functions returned an indeterminate
 * pointer to any caller that used the result.
 */

/* Defined only so that references to the current context resolve. */
MemoryContext CurrentMemoryContext;

/*
 * Allocate size bytes.  The context argument is ignored.
 * Returns NULL if the underlying malloc() fails.
 */
void *
MemoryContextAlloc(MemoryContext context, Size size)
{
    (void) context;
    return malloc(size);
}

/*
 * Release memory obtained from MemoryContextAlloc() or repalloc().
 */
void
pfree(void *pointer)
{
    free(pointer);
}

/*
 * Resize a block obtained from MemoryContextAlloc() or repalloc().
 * Returns NULL (leaving the original block untouched) if realloc() fails.
 */
void *
repalloc(void *pointer, Size size)
{
    return realloc(pointer, size);
}
This diff is collapsed.
/*
* testing for sjis2mic() and mic2sjis()
*/
#include "conv.c"
/*
 * Print the bytes of buf as two-digit hex values, each followed by a
 * space, stopping at the first zero byte.  A newline is printed when the
 * zero byte is found; if no zero byte occurs within limit bytes, nothing
 * further is printed.
 */
static void
dump_until_nul(const unsigned char *buf, int limit)
{
    int         k = 0;

    while (k < limit && buf[k] != 0)
    {
        printf("%02x ", buf[k]);
        k++;
    }
    if (k < limit)
        printf("\n");
}

/*
 * Round-trip test: feed a fixed byte sequence through sjis2mic(), dump
 * the result, feed that result through mic2sjis(), and dump again.
 */
int
main()
{
    unsigned char eucbuf[1024];
    unsigned char sjisbuf[1024];
    unsigned char sjis[] = {0x81, 0x40, 0xa1, 0xf0, 0x40, 0xf0, 0x9e, 0xf5, 0x40, 0xfa, 0x40, 0xfa, 0x54, 0xfa, 0x7b, 0x00};

    sjis2mic(sjis, eucbuf, 1024);
    dump_until_nul(eucbuf, 1024);

    mic2sjis(eucbuf, sjisbuf, 1024);
    dump_until_nul(sjisbuf, 1024);

    return (0);
}
/*
* $Id: uconv.c,v 1.1 2000/10/12 06:06:50 ishii Exp $
*/
#include "pg_wchar.h"
/*
 * Encode one UCS-2 code point as UTF-8.
 *
 *   ucs - the 16-bit code point
 *   utf - output buffer; must have room for at least 3 bytes
 *
 * Returns the number of bytes written to utf: 1 for 0x0000-0x007f,
 * 2 for 0x0080-0x07ff and 3 for everything above.  No terminator is
 * written.
 */
static int
pg_ucs2utf(const unsigned short ucs, unsigned char *utf)
{
    if (ucs < 0x0080)
    {
        /* 0xxxxxxx */
        utf[0] = ucs;
        return 1;
    }

    if (ucs < 0x0800)
    {
        /* 110xxxxx 10xxxxxx */
        utf[0] = 0xc0 | (ucs >> 6);
        utf[1] = 0x80 | (ucs & 0x3f);
        return 2;
    }

    /* 1110xxxx 10xxxxxx 10xxxxxx */
    utf[0] = 0xe0 | (ucs >> 12);
    utf[1] = 0x80 | ((ucs >> 6) & 0x3f);
    utf[2] = 0x80 | (ucs & 0x3f);
    return 3;
}
/*
 * One entry of a UCS-2 -> local-encoding table (element type used for
 * mapISO8859 and mapJIS in main()).
 * NOTE(review): the included .map files presumably initialize these
 * positionally, so the field order must not change -- confirm.
 */
typedef struct
{
unsigned short ucs; /* UCS-2 code point; converted to UTF-8 on output */
unsigned short code; /* code in the local character set; main() ORs in the high bit(s) when printing */
unsigned char encoding; /* which local set the code belongs to; main() compares it with LATIN5 and X0208 */
} ucs_to_local;
/*
 * One entry of a local-encoding -> UCS-2 table (element type used for the
 * revISO8859_N arrays in main()).
 * NOTE(review): the included .rev files presumably initialize these
 * positionally, so the field order must not change -- confirm.
 */
typedef struct
{
unsigned short code; /* code in the local character set; main() ORs in 0x80 when printing */
unsigned short ucs; /* UCS-2 code point; converted to UTF-8 on output */
} local_to_ucs;
#include "ucs_to_iso8859.map"
#include "iso88592.rev"
#include "iso88593.rev"
#include "iso88594.rev"
#include "iso88595.rev"
#define X0208 0
#define X0212 1
#include "ucs_to_jis.map"
int
main()
{
int i,j;
int l;
unsigned int euc;
unsigned char u[4];
FILE *fd;
printf("static pg_utf_to_local mapISO8859[] = {\n");
for (i=0;i<sizeof(mapISO8859)/sizeof(ucs_to_local);i++) {
if (mapISO8859[i].encoding > LATIN5)
continue;
l = pg_ucs2utf(mapISO8859[i].ucs, u);
printf(" {0x");
for(j=0;j<l;j++) {
printf("%02x", u[j]);
}
printf(", 0x%04x, %s},\n",
mapISO8859[i].code|0x80,
pg_get_enc_ent(mapISO8859[i].encoding)->name);
}
printf("};\n");
printf("\nstatic pg_local_to_utf ISO8859_2[] = {\n");
for (i=0;i<sizeof(revISO8859_2)/sizeof(local_to_ucs);i++) {
l = pg_ucs2utf(revISO8859_2[i].ucs, u);
printf(" {0x%04x, ", revISO8859_2[i].code|0x80);
printf("0x");
for(j=0;j<l;j++) {
printf("%02x", u[j]);
}
printf("},\n");
}
printf("};\n");
printf("\nstatic pg_local_to_utf ISO8859_3[] = {\n");
for (i=0;i<sizeof(revISO8859_3)/sizeof(local_to_ucs);i++) {
l = pg_ucs2utf(revISO8859_3[i].ucs, u);
printf(" {0x%04x, ", revISO8859_3[i].code|0x80);
printf("0x");
for(j=0;j<l;j++) {
printf("%02x", u[j]);
}
printf("},\n");
}
printf("};\n");
printf("\nstatic pg_local_to_utf ISO8859_4[] = {\n");
for (i=0;i<sizeof(revISO8859_4)/sizeof(local_to_ucs);i++) {
l = pg_ucs2utf(revISO8859_4[i].ucs, u);
printf(" {0x%04x, ", revISO8859_4[i].code|0x80);
printf("0x");
for(j=0;j<l;j++) {
printf("%02x", u[j]);
}
printf("},\n");
}
printf("};\n");
printf("\nstatic pg_local_to_utf ISO8859_5[] = {\n");
for (i=0;i<sizeof(revISO8859_5)/sizeof(local_to_ucs);i++) {
l = pg_ucs2utf(revISO8859_5[i].ucs, u);
printf(" {0x%04x, ", revISO8859_5[i].code|0x80);
printf("0x");
for(j=0;j<l;j++) {
printf("%02x", u[j]);
}
printf("},\n");
}
printf("};\n");
fd = fopen("UTF_to_EUC_JP.map", "w");
fprintf(fd, "static pg_utf_to_local mapUTF_to_EUC_JP[] = {\n");
for (i=0;i<sizeof(mapJIS)/sizeof(ucs_to_local);i++) {
l = pg_ucs2utf(mapJIS[i].ucs, u);
fprintf(fd, " {0x");
for(j=0;j<l;j++) {
fprintf(fd, "%02x", u[j]);
}
if (mapJIS[i].encoding == X0208)
{
euc = mapJIS[i].code|0x8080;
}
else
{
euc = SS3 << 16 | mapJIS[i].code | 0x8080;
}
fprintf(fd, ", 0x%04x, %s},\n",
euc,
"EUC_JP");
}
fprintf(fd, "};\n");
fclose(fd);
return(0);
}
/*
* $Id: uconv2.c,v 1.1 2000/10/12 06:06:50 ishii Exp $
*/
#include "pg_wchar.h"
#include "UTF_to_EUC_JP.map"
static int compare1(const void *p1, const void *p2)
{
unsigned int v1, v2;
v1 = ((pg_utf_to_local *)p1)->code;
v2 = ((pg_utf_to_local *)p2)->code;
return(v1 - v2);
}
/*
 * Build the reverse EUC_JP -> UTF-8 table.
 *
 * Sorts mapUTF_to_EUC_JP in place by local code (see compare1) and writes
 * it, with the two columns swapped, to the file "EUC_JP_to_UTF.map" as the
 * C initializer of mapEUC_JP_to_UTF.
 *
 * Returns 0 on success, 1 if the output file cannot be created or written
 * (previously a failed fopen() led straight to a NULL FILE pointer being
 * used).
 */
int
main()
{
    const size_t nent = sizeof(mapUTF_to_EUC_JP) / sizeof(pg_utf_to_local);
    size_t      i;
    FILE       *fd;

    qsort(mapUTF_to_EUC_JP, nent, sizeof(pg_utf_to_local), compare1);

    fd = fopen("EUC_JP_to_UTF.map", "w");
    if (fd == NULL)
    {
        perror("EUC_JP_to_UTF.map");
        return (1);
    }

    fprintf(fd, "static pg_local_to_utf mapEUC_JP_to_UTF[] = {\n");
    for (i = 0; i < nent; i++)
    {
        fprintf(fd, " {0x%08x, 0x%08x},\n",
                mapUTF_to_EUC_JP[i].code,
                mapUTF_to_EUC_JP[i].utf);
    }
    fprintf(fd, "};\n");

    /* fclose() flushes; a failure here means the table is incomplete. */
    if (fclose(fd) != 0)
    {
        perror("EUC_JP_to_UTF.map");
        return (1);
    }
    return (0);
}
/*
* testing of utf2wchar()
* $Id: utftest.c,v 1.3 1999/07/15 23:03:31 momjian Exp $
* $Id: utftest.c,v 1.4 2000/10/12 06:06:50 ishii Exp $
*/
#include "regex/regex.h"
#include "regex/utils.h"
#include "regex/regex2.h"
#include "regex/pg_wchar.h"
#include "conv.c"
#include "wchar.c"
#include "mbutils.c"
int
main()
{
/* Example 1 from RFC2044 */
......@@ -21,11 +19,17 @@ main()
char *utf[] = {utf1, utf2, utf3};
pg_wchar ucs[128];
pg_wchar *p;
unsigned char iso[1024];
int i;
/* UTF8-->ISO8859-2 test */
unsigned char utf_iso8859_2[] = {0x01, 0x00, 0x01, 0x02, 0x01, 0x55, 0x02, 0xdd, 0x00};
printf("===== testing of pg_utf2wchar_with_len =====\n");
for (i = 0; i < sizeof(utf) / sizeof(char *); i++)
{
pg_utf2wchar(utf[i], ucs);
pg_utf2wchar_with_len(utf[i], ucs, 128);
p = ucs;
while (*p)
{
......@@ -34,4 +38,16 @@ main()
}
printf("\n");
}
printf("===== testing of utf_to_latin2 =====\n");
utf_to_latin(utf_iso8859_2, iso, LATIN2, 128);
for (i = 0; i < sizeof(iso) / sizeof(char *); i++)
{
printf("%04x ", iso[i]);
if (iso[i] == 0x00)
break;
}
printf("\n");
return(0);
}
/*
* conversion functions between pg_wchar and multi-byte streams.
* Tatsuo Ishii
* $Id: wchar.c,v 1.12 2000/08/27 10:40:48 ishii Exp $
* $Id: wchar.c,v 1.13 2000/10/12 06:06:50 ishii Exp $
*
* WIN1250 client encoding updated by Pavel Behal
*
......@@ -246,7 +246,7 @@ pg_euctw_mblen(const unsigned char *s)
}
/*
* convert UTF-8 to pg_wchar (UCS-2)
* convert UTF-8 string to pg_wchar (UCS-2)
* caller should allocate enough space for "to"
* len: length of from.
* "from" not necessarily null terminated.
......@@ -296,7 +296,10 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar * to, int len)
return(cnt);
}
static int
/*
* returns the byte length of a UTF-8 word pointed to by s
*/
int
pg_utf_mblen(const unsigned char *s)
{
int len = 1;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment