Commit 14f72b9a authored by Tatsuo Ishii's avatar Tatsuo Ishii

Add GB18030 support. Contributed by Bill Huang <bill_huanghb@ybb.ne.jp>

(ODBC support has not been committed yet. left for Hiroshi...)
parent 620dbc98
This diff is collapsed.
#! /usr/bin/perl
#
# Copyright 2002 by Bill Huang
#
# $Id: UCS_to_GB18030.pl,v 1.1 2002/06/13 08:28:55 ishii Exp $
#
# Generate UTF-8 <--> GB18030 code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain ISO10646-GB18030.TXT from
# the organization's ftp site.
#
# ISO10646-GB18030.TXT format:
# GB18030 code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
# NOTE(review): ucs2utf.pl is loaded but none of its routines are called
# below; the first column of the map file is used as the "UTF" value as-is.
# Confirm that this is intended before removing the require.
require "ucs2utf.pl";

$in_file = "ISO10646-GB18030.TXT";

#
# read_map( $by_code )
#
# Parse $in_file and return a hash built from the first two
# whitespace-separated hex columns of every data line.  The parsing
# treats column one as the UCS/UTF value and column two as the GB18030
# code (NOTE(review): the format description in the file header lists
# them in the opposite order -- verify against the real map file).
#
#   $by_code false: key = column one, value = column two
#   $by_code true : key = column two, value = column one
#
# Lines starting with '#', blank lines and lines with fewer than two
# columns are skipped so that they cannot produce a bogus {0, 0} entry.
#
sub read_map
{
	my( $by_code ) = @_;
	my( %map, $u, $c, $rest, $utf, $code );

	open( FILE, $in_file ) || die( "cannot open $in_file" );
	while( <FILE> ){
		# chomp, not chop: a last line without newline must keep its
		# final hex digit
		chomp;
		next if( /^#/ );
		( $u, $c, $rest ) = split;
		next unless( defined( $u ) && defined( $c ) );
		$utf = hex($u);
		$code = hex($c);
		if( $by_code ){
			$map{ $code } = $utf;
		} else {
			$map{ $utf } = $code;
		}
	}
	close( FILE );
	die( "no mappings found in $in_file" ) unless( %map );
	return %map;
}

#
# write_map( $file, $decl, \%map )
#
# Write the map as a C array definition "static $decl[ N ] = { ... };"
# to $file, sorted numerically by key.  N is the number of entries
# actually emitted (unique keys), so the declared size always matches
# the initializer and the last entry is the one without a trailing comma.
#
sub write_map
{
	my( $file, $decl, $map ) = @_;
	my @keys = sort {$a <=> $b} keys( %$map );
	my $remaining = scalar( @keys );

	open( FILE, "> $file" ) || die( "cannot open $file" );
	print FILE "static ", $decl, "[ ", $remaining, " ] = {\n";
	for my $index ( @keys ){
		$remaining--;
		if( $remaining == 0 ){
			printf FILE " {0x%04x, 0x%04x}\n", $index, $$map{ $index };
		} else {
			printf FILE " {0x%04x, 0x%04x},\n", $index, $$map{ $index };
		}
	}
	print FILE "};\n";
	close(FILE);
}

#
# first, generate UTF8 --> GB18030 table
#
%array = read_map( 0 );
write_map( "utf8_to_gb18030.map", "pg_utf_to_local ULmapGB18030", \%array );

#
# then generate GB18030 --> UTF8 table
#
# Assigning a fresh hash replaces the old "reset 'array'", which cleared
# every package variable whose name starts with a, r or y.
%array = read_map( 1 );
write_map( "gb18030_to_utf8.map", "pg_local_to_utf LUmapGB18030", \%array );
This diff is collapsed.
This diff is collapsed.
......@@ -6,7 +6,7 @@
* WIN1250 client encoding support contributed by Pavel Behal
* SJIS UDC (NEC selection IBM kanji) support contributed by Eiji Tokuya
*
* $Id: conv.c,v 1.37 2002/03/06 06:10:26 momjian Exp $
* $Id: conv.c,v 1.38 2002/06/13 08:28:54 ishii Exp $
*
*
*/
......@@ -48,6 +48,8 @@
#include "Unicode/euc_jp_to_utf8.map"
#include "Unicode/utf8_to_euc_cn.map"
#include "Unicode/euc_cn_to_utf8.map"
#include "Unicode/utf8_to_gb18030.map"
#include "Unicode/gb18030_to_utf8.map"
#include "Unicode/utf8_to_euc_kr.map"
#include "Unicode/euc_kr_to_utf8.map"
#include "Unicode/utf8_to_euc_tw.map"
......@@ -515,6 +517,96 @@ mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
*p = '\0';
}
/*
 * GB18030 ---> MIC
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Copy up to len bytes of the NUL-terminated GB18030 string "gb18030"
 * to "p", passing every well-formed character through unchanged, and
 * NUL-terminate the result.  The output never needs more than len + 1
 * bytes.
 *
 * Character forms recognised:
 *	 1 byte : 0x00-0x7f
 *	 2 bytes: 0x81-0xfe, then 0x40-0x7e or 0x80-0xfe
 *	 4 bytes: 0x81-0xfe, 0x30-0x39, 0x81-0xfe, 0x30-0x39
 *
 * A byte that cannot start a well-formed character (including a
 * character cut short by len or by the terminating NUL) is dropped and
 * scanning resumes at the following byte, so the function never reads
 * past the terminator and never copies more bytes than the character has.
 */
static void
gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
{
	int			c1;
	int			c2;

	while (len > 0 && (c1 = *gb18030))
	{
		if (c1 < 0x80)
		{
			/* should be ASCII */
			*p++ = c1;
			gb18030++;
			len--;
			continue;
		}

		if (c1 >= 0x81 && c1 <= 0xfe)
		{
			/* only look at the second byte if it is inside the input */
			c2 = (len >= 2) ? gb18030[1] : 0;

			if (c2 >= 0x30 && c2 <= 0x39)
			{
				/*
				 * Four-byte form.  The && chain stops at the first
				 * mismatch, so a NUL in byte 3 is never stepped over.
				 */
				if (len >= 4 &&
					gb18030[2] >= 0x81 && gb18030[2] <= 0xfe &&
					gb18030[3] >= 0x30 && gb18030[3] <= 0x39)
				{
					*p++ = c1;
					*p++ = c2;
					*p++ = gb18030[2];
					*p++ = gb18030[3];
					gb18030 += 4;
					len -= 4;
					continue;
				}
			}
			else if ((c2 >= 0x40 && c2 <= 0x7e) ||
					 (c2 >= 0x80 && c2 <= 0xfe))
			{
				/* two-byte form */
				*p++ = c1;
				*p++ = c2;
				gb18030 += 2;
				len -= 2;
				continue;
			}
		}

		/* throw the strange code: skip this byte only and resync */
		gb18030++;
		len--;
	}
	*p = '\0';
}
/*
* MIC ---> GB18030
* Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
*
* Walks the string at "mic" until len is used up or a zero byte is seen,
* writes the converted bytes to "p" and terminates the output with '\0'.
* ASCII bytes and byte sequences that look like GB18030 two-byte or
* four-byte characters are copied through unchanged; anything else is
* handed to printBogusChar().
*
* NOTE(review): printBogusChar() is not visible here; it presumably writes
* a printable form of the byte(s) at *mic to *p and advances both
* pointers -- confirm before changing the error paths.
*/
static void
mic2gb18030(unsigned char *mic, unsigned char *p, int len)
{
int c1;
int c2;
while (len > 0 && (c1 = *mic))
{
/*
* Charge the length reported by pg_mic_mblen() for the lead byte and
* step past that byte.
* NOTE(review): the branches below consume 1, 2 or 4 bytes according
* to GB18030 rules; verify that pg_mic_mblen() reports the same counts
* for these lead bytes, otherwise len drifts from the bytes consumed.
*/
len -= pg_mic_mblen(mic++);
if (c1 <= 0x7f) /*ASCII*/
{
*p++ = c1;
}
else if (c1 >= 0x81 && c1 <= 0xfe)
{
/* lead byte of a multi-byte character: fetch the second byte */
c2 = *mic++;
if((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)){
/* two-byte form: copy both bytes */
*p++ = c1;
*p++ = c2;
}
else if(c2 >= 0x30 && c2 <= 0x39){
/*
* four-byte form: copy the two bytes already read plus the next
* two, which are not range-checked or tested for '\0'
*/
*p++ = c1;
*p++ = c2;
*p++ = *mic++;
*p++ = *mic++;
}
else{
/*
* Unrecognised second byte.  mic is one past c2 here, so each
* mic-- below moves it back onto c2 (not onto c1) before the call.
* NOTE(review): check that reporting from c2's position twice is
* what is wanted, given how far printBogusChar() advances mic.
*/
mic--;
printBogusChar(&mic, &p);
mic--;
printBogusChar(&mic, &p);
}
}
else{
/* lead byte 0x80 or 0xff: step back onto it and report it */
mic--;
printBogusChar(&mic, &p);
}
}
*p = '\0';
}
/*
* EUC_TW ---> MIC
*/
......@@ -1596,6 +1688,26 @@ euc_cn_to_utf(unsigned char *euc, unsigned char *utf, int len)
sizeof(LUmapEUC_CN) / sizeof(pg_local_to_utf), PG_EUC_CN, len);
}
/*
 * UTF-8 ---> GB18030
 *
 * Thin wrapper: forwards the source string, the destination buffer and
 * len to utf_to_local() together with the ULmapGB18030 table and its
 * number of entries.
 */
static void
utf_to_gb18030(unsigned char *utf, unsigned char *euc, int len)
{
	utf_to_local(utf,
				 euc,
				 ULmapGB18030,
				 sizeof(ULmapGB18030) / sizeof(ULmapGB18030[0]),
				 len);
}
/*
 * GB18030 ---> UTF-8
 *
 * Thin wrapper: forwards the source string, the destination buffer and
 * len to local_to_utf() together with the LUmapGB18030 table, its number
 * of entries and the PG_GB18030 encoding id.
 */
static void
gb18030_to_utf(unsigned char *euc, unsigned char *utf, int len)
{
	local_to_utf(euc,
				 utf,
				 LUmapGB18030,
				 sizeof(LUmapGB18030) / sizeof(LUmapGB18030[0]),
				 PG_GB18030,
				 len);
}
/*
* UTF-8 ---> EUC_KR
*/
......@@ -1935,6 +2047,9 @@ pg_enconv pg_enconv_tbl[] =
{
PG_WIN1250, win12502mic, mic2win1250, win1250_to_utf, utf_to_win1250
},
{
PG_GB18030, gb180302mic, mic2gb18030, gb18030_to_utf, utf_to_gb18030
},
};
#else
......@@ -2019,9 +2134,18 @@ pg_enconv pg_enconv_tbl[] =
{
PG_BIG5, big52mic, mic2big5, 0, 0
},
{
PG_GBK, 0, 0, 0, 0
},
{
PG_UHC, 0, 0, 0, 0
},
{
PG_WIN1250, win12502mic, mic2win1250, 0, 0
},
{
PG_GB18030, gb180302mic, mic2gb18030, 0, 0
},
};
#endif /* UNICODE_CONVERSION */
......@@ -2,7 +2,7 @@
* Encoding names and routines for working with them. Everything
* in this file is shared between FE and BE.
*
* $Id: encnames.c,v 1.7 2002/03/05 05:52:44 momjian Exp $
* $Id: encnames.c,v 1.8 2002/06/13 08:28:54 ishii Exp $
*/
#ifdef FRONTEND
#include "postgres_fe.h"
......@@ -60,7 +60,11 @@ pg_encname pg_encname_tbl[] =
{
"euctw", PG_EUC_TW
}, /* EUC-TW; Extended Unix Code for
* traditional Chinese */
{
"gb18030", PG_GB18030
}, /* GB18030;GB18030 */
{
"gbk", PG_GBK
}, /* GBK; Chinese Windows CodePage 936
......@@ -239,7 +243,6 @@ pg_encname pg_encname_tbl[] =
{
"windows950", PG_BIG5
}, /* alias for BIG5 */
{
NULL, 0
} /* last */
......@@ -353,6 +356,9 @@ pg_enc2name pg_enc2name_tbl[] =
},
{
"WIN1250", PG_WIN1250
},
{
"GB18030", PG_GB18030
}
};
......
/*
* conversion functions between pg_wchar and multi-byte streams.
* Tatsuo Ishii
* $Id: wchar.c,v 1.27 2002/03/05 05:52:44 momjian Exp $
* $Id: wchar.c,v 1.28 2002/06/13 08:28:54 ishii Exp $
*
* WIN1250 client encoding updated by Pavel Behal
*
......@@ -510,6 +510,31 @@ pg_uhc_mblen(const unsigned char *s)
return (len);
}
/*
 * GB18030
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Return the byte length of the character starting at s:
 *	 1 when the first byte is 0x7f or below (ASCII),
 *	 4 when the first byte is above 0x7f and the second byte is 0x30-0x39,
 *	 2 for every other first byte above 0x7f.
 * The second byte is only examined for non-ASCII first bytes.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	unsigned char trail;

	if (*s < 0x80)
		return 1;				/* ASCII */

	trail = s[1];
	return (trail >= 0x30 && trail <= 0x39) ? 4 : 2;
}
pg_wchar_tbl pg_wchar_table[] = {
{pg_ascii2wchar_with_len, pg_ascii_mblen, 1}, /* 0; PG_SQL_ASCII */
{pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3}, /* 1; PG_EUC_JP */
......@@ -544,6 +569,7 @@ pg_wchar_tbl pg_wchar_table[] = {
{0, pg_gbk_mblen, 2}, /* 30; PG_GBK */
{0, pg_uhc_mblen, 2}, /* 31; PG_UHC */
{pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 32; PG_WIN1250 */
{0, pg_gb18030_mblen, 2} /* 33; PG_GB18030 */
};
/* returns the byte length of a word for mule internal code */
......
/* $Id: pg_wchar.h,v 1.38 2002/03/05 05:52:50 momjian Exp $ */
/* $Id: pg_wchar.h,v 1.39 2002/06/13 08:30:22 ishii Exp $ */
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
......@@ -189,7 +189,7 @@ typedef enum pg_enc
PG_GBK, /* GBK (Windows-936) */
PG_UHC, /* UHC (Windows-949) */
PG_WIN1250, /* windows-1250 */
PG_GB18030, /* GB18030 */
_PG_LAST_ENCODING_ /* mark only */
} pg_enc;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment