Commit 14f72b9a authored by Tatsuo Ishii's avatar Tatsuo Ishii

Add GB18030 support. Contributed by Bill Huang <bill_huanghb@ybb.ne.jp>

(ODBC support has not been committed yet. left for Hiroshi...)
parent 620dbc98
This source diff could not be displayed because it is too large. You can view the blob instead.
#! /usr/bin/perl
#
# Copyright 2002 by Bill Huang
#
# $Id: UCS_to_GB18030.pl,v 1.1 2002/06/13 08:28:55 ishii Exp $
#
# Generate UTF-8 <--> GB18030 code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain ISO10646-GB18030.TXT from
# the organization's ftp site.
#
# ISO10646-GB18030.TXT format:
# GB18030 code in hex
# UCS-2 code in hex
# # and Unicode name (not used in this script)
#
# NOTE(review): the code below takes the FIRST column as the UTF-side code
# and the SECOND column as the GB18030 code, which is the reverse of the
# order listed above -- confirm against the actual map file.
#
# Output files (written to the current directory):
#   utf8_to_gb18030.map  - ULmapGB18030[], sorted by UTF-side code
#   gb18030_to_utf8.map  - LUmapGB18030[], sorted by GB18030 code

use strict;

require "ucs2utf.pl";

my $in_file = "ISO10646-GB18030.TXT";

# Write one C table to $file.  The array size and the "last entry has no
# trailing comma" decision are both derived from the number of distinct
# keys, so duplicate codes in the input can no longer produce a wrong
# array size or a stray/missing comma.
sub write_map {
    my ( $file, $type, $name, $map ) = @_;

    my @keys = sort { $a <=> $b } keys %$map;

    open( my $out, '>', $file ) || die( "cannot open $file" );
    printf $out "static %s %s[ %d ] = {\n", $type, $name, scalar(@keys);
    for my $i ( 0 .. $#keys ) {
        my $sep = ( $i == $#keys ) ? "" : ",";
        printf $out " {0x%04x, 0x%04x}%s\n", $keys[$i], $map->{ $keys[$i] }, $sep;
    }
    print $out "};\n";
    close($out) || die( "cannot close $file" );
}

# Read the map file once and fill both directions.  When a code appears
# more than once the last line wins, as it did with the two separate passes.
my %utf_to_gb;
my %gb_to_utf;

open( my $in, '<', $in_file ) || die( "cannot open $in_file" );
while ( my $line = <$in> ) {
    # chomp (not chop) so a final line without a newline keeps its last digit
    chomp $line;
    next if $line =~ /^#/;

    my ( $u, $c ) = split ' ', $line;

    # blank or incomplete lines would otherwise turn into a {0, 0} entry
    next unless defined $u && defined $c;

    my $utf  = hex($u);
    my $code = hex($c);
    $utf_to_gb{$utf}  = $code;
    $gb_to_utf{$code} = $utf;
}
close($in);

#
# first, generate UTF8 --> GB18030 table
#
write_map( "utf8_to_gb18030.map", "pg_utf_to_local", "ULmapGB18030", \%utf_to_gb );

#
# then generate GB18030 --> UTF8 table
#
write_map( "gb18030_to_utf8.map", "pg_local_to_utf", "LUmapGB18030", \%gb_to_utf );
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -6,7 +6,7 @@
* WIN1250 client encoding support contributed by Pavel Behal
* SJIS UDC (NEC selection IBM kanji) support contributed by Eiji Tokuya
*
* $Id: conv.c,v 1.37 2002/03/06 06:10:26 momjian Exp $
* $Id: conv.c,v 1.38 2002/06/13 08:28:54 ishii Exp $
*
*
*/
......@@ -48,6 +48,8 @@
#include "Unicode/euc_jp_to_utf8.map"
#include "Unicode/utf8_to_euc_cn.map"
#include "Unicode/euc_cn_to_utf8.map"
#include "Unicode/utf8_to_gb18030.map"
#include "Unicode/gb18030_to_utf8.map"
#include "Unicode/utf8_to_euc_kr.map"
#include "Unicode/euc_kr_to_utf8.map"
#include "Unicode/utf8_to_euc_tw.map"
......@@ -515,6 +517,96 @@ mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
*p = '\0';
}
/*
 * GB18030 ---> MIC
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Copies at most len bytes of the NUL-terminated string at gb18030 to p,
 * one character at a time, and NUL-terminates the result.  The bytes of
 * each accepted character are copied unchanged.
 *
 * Character forms recognised:
 *	1 byte:  0x00-0x7f
 *	2 bytes: 0x81-0xfe, then 0x40-0x7e or 0x80-0xfe
 *	4 bytes: 0x81-0xfe, then 0x30-0x39, then two more bytes
 *
 * A lead byte that does not start a complete, well-formed character
 * (including 0x80, 0xff, and sequences cut short by len or by the NUL
 * terminator) is dropped, and scanning resumes at the following byte.
 * Exactly one byte is consumed and counted in that case, so the
 * terminator is never skipped.
 *
 * p must have room for len + 1 bytes.
 */
static void
gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
{
	int			c1;
	int			c2;

	while (len > 0 && (c1 = *gb18030))
	{
		if (c1 < 0x80)
		{
			/* should be ASCII */
			*p++ = c1;
			gb18030++;
			len--;
		}
		else if (c1 >= 0x81 && c1 <= 0xfe && len >= 2)
		{
			c2 = gb18030[1];

			if (c2 >= 0x30 && c2 <= 0x39 &&
				len >= 4 && gb18030[2] != '\0' && gb18030[3] != '\0')
			{
				/* four-byte character: copy exactly four bytes */
				*p++ = *gb18030++;
				*p++ = *gb18030++;
				*p++ = *gb18030++;
				*p++ = *gb18030++;
				len -= 4;
			}
			else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
			{
				/* two-byte character: copy exactly two bytes */
				*p++ = *gb18030++;
				*p++ = *gb18030++;
				len -= 2;
			}
			else
			{
				/* throw the strange lead byte, re-examine what follows */
				gb18030++;
				len--;
			}
		}
		else
		{
			/* 0x80, 0xff, or a lead byte with nothing left after it */
			gb18030++;
			len--;
		}
	}
	*p = '\0';
}
/*
 * MIC ---> GB18030
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Walks the NUL-terminated input at mic while len is positive and writes
 * the result, followed by a NUL terminator, to p.  ASCII bytes and
 * sequences shaped like GB18030 two-byte or four-byte characters are
 * copied through unchanged; anything else is handed to printBogusChar().
 */
static void
mic2gb18030(unsigned char *mic, unsigned char *p, int len)
{
int c1;
int c2;
while (len > 0 && (c1 = *mic))
{
/*
 * Charge len with whatever pg_mic_mblen() reports for this character and
 * step past its first byte.
 * NOTE(review): pg_mic_mblen() is not visible here; the bytes consumed by
 * the branches below (1, 2 or 4) may differ from what it returns -- confirm.
 */
len -= pg_mic_mblen(mic++);
if (c1 <= 0x7f) /*ASCII*/
{
*p++ = c1;
}
else if (c1 >= 0x81 && c1 <= 0xfe)
{
/* lead byte of a multibyte character: fetch the second byte */
c2 = *mic++;
if((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe)){
/* two-byte form */
*p++ = c1;
*p++ = c2;
}
else if(c2 >= 0x30 && c2 <= 0x39){
/*
 * four-byte form: copy the remaining two bytes as well.
 * NOTE(review): bytes three and four are copied without checking for the
 * NUL terminator.
 */
*p++ = c1;
*p++ = c2;
*p++ = *mic++;
*p++ = *mic++;
}
else{
/*
 * Unrecognised second byte: step back onto it and report it, then step
 * back once more and report again.
 * NOTE(review): printBogusChar() is not visible here, so where mic ends
 * up after each call cannot be told from this code; verify that the
 * second call reports the intended byte and that mic still advances.
 */
mic--;
printBogusChar(&mic, &p);
mic--;
printBogusChar(&mic, &p);
}
}
else{
/* 0x80 or 0xff as first byte: step back onto it and report it */
mic--;
printBogusChar(&mic, &p);
}
}
*p = '\0';
}
/*
* EUC_TW ---> MIC
*/
......@@ -1596,6 +1688,26 @@ euc_cn_to_utf(unsigned char *euc, unsigned char *utf, int len)
sizeof(LUmapEUC_CN) / sizeof(pg_local_to_utf), PG_EUC_CN, len);
}
/*
 * UTF-8 ---> GB18030
 *
 * Hands the input to utf_to_local() together with the ULmapGB18030
 * table and the number of entries in it.
 */
static void
utf_to_gb18030(unsigned char *utf, unsigned char *euc, int len)
{
	int			nentries = sizeof(ULmapGB18030) / sizeof(pg_utf_to_local);

	utf_to_local(utf, euc, ULmapGB18030, nentries, len);
}
/*
 * GB18030 ---> UTF-8
 *
 * Hands the input to local_to_utf() together with the LUmapGB18030
 * table, the number of entries in it, and the PG_GB18030 encoding id.
 */
static void
gb18030_to_utf(unsigned char *euc, unsigned char *utf, int len)
{
	int			nentries = sizeof(LUmapGB18030) / sizeof(pg_local_to_utf);

	local_to_utf(euc, utf, LUmapGB18030, nentries, PG_GB18030, len);
}
/*
* UTF-8 ---> EUC_KR
*/
......@@ -1935,6 +2047,9 @@ pg_enconv pg_enconv_tbl[] =
{
PG_WIN1250, win12502mic, mic2win1250, win1250_to_utf, utf_to_win1250
},
{
PG_GB18030, gb180302mic, mic2gb18030, gb18030_to_utf, utf_to_gb18030
},
};
#else
......@@ -2019,9 +2134,18 @@ pg_enconv pg_enconv_tbl[] =
{
PG_BIG5, big52mic, mic2big5, 0, 0
},
{
PG_GBK, 0, 0, 0, 0
},
{
PG_UHC, 0, 0, 0, 0
},
{
PG_WIN1250, win12502mic, mic2win1250, 0, 0
},
{
PG_GB18030, gb180302mic, mic2gb18030, 0, 0
},
};
#endif /* UNICODE_CONVERSION */
......@@ -2,7 +2,7 @@
* Encoding names and routines for working with them. Everything
* in this file is shared between FE and BE.
*
* $Id: encnames.c,v 1.7 2002/03/05 05:52:44 momjian Exp $
* $Id: encnames.c,v 1.8 2002/06/13 08:28:54 ishii Exp $
*/
#ifdef FRONTEND
#include "postgres_fe.h"
......@@ -60,7 +60,11 @@ pg_encname pg_encname_tbl[] =
{
"euctw", PG_EUC_TW
}, /* EUC-TW; Extended Unix Code for
* traditional Chinese */
{
"gb18030", PG_GB18030
}, /* GB18030;GB18030 */
{
"gbk", PG_GBK
}, /* GBK; Chinese Windows CodePage 936
......@@ -239,7 +243,6 @@ pg_encname pg_encname_tbl[] =
{
"windows950", PG_BIG5
}, /* alias for BIG5 */
{
NULL, 0
} /* last */
......@@ -353,6 +356,9 @@ pg_enc2name pg_enc2name_tbl[] =
},
{
"WIN1250", PG_WIN1250
},
{
"GB18030", PG_GB18030
}
};
......
/*
* conversion functions between pg_wchar and multi-byte streams.
* Tatsuo Ishii
* $Id: wchar.c,v 1.27 2002/03/05 05:52:44 momjian Exp $
* $Id: wchar.c,v 1.28 2002/06/13 08:28:54 ishii Exp $
*
* WIN1250 client encoding updated by Pavel Behal
*
......@@ -510,6 +510,31 @@ pg_uhc_mblen(const unsigned char *s)
return (len);
}
/*
 * GB18030
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Returns the byte length of the character starting at s:
 *	1 when the first byte is <= 0x7f (ASCII),
 *	4 when the second byte is in 0x30-0x39,
 *	2 for every other second byte.
 * The second byte is only examined when the first byte is above 0x7f.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	unsigned char trail;

	if (s[0] <= 0x7f)
		return 1;				/* ASCII */

	trail = s[1];
	if (trail >= 0x30 && trail <= 0x39)
		return 4;

	return 2;
}
pg_wchar_tbl pg_wchar_table[] = {
{pg_ascii2wchar_with_len, pg_ascii_mblen, 1}, /* 0; PG_SQL_ASCII */
{pg_eucjp2wchar_with_len, pg_eucjp_mblen, 3}, /* 1; PG_EUC_JP */
......@@ -544,6 +569,7 @@ pg_wchar_tbl pg_wchar_table[] = {
{0, pg_gbk_mblen, 2}, /* 30; PG_GBK */
{0, pg_uhc_mblen, 2}, /* 31; PG_UHC */
{pg_latin12wchar_with_len, pg_latin1_mblen, 1}, /* 32; PG_WIN1250 */
{0, pg_gb18030_mblen, 2} /* 33; PG_GB18030 */
};
/* returns the byte length of a word for mule internal code */
......
/* $Id: pg_wchar.h,v 1.38 2002/03/05 05:52:50 momjian Exp $ */
/* $Id: pg_wchar.h,v 1.39 2002/06/13 08:30:22 ishii Exp $ */
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
......@@ -189,7 +189,7 @@ typedef enum pg_enc
PG_GBK, /* GBK (Windows-936) */
PG_UHC, /* UHC (Windows-949) */
PG_WIN1250, /* windows-1250 */
PG_GB18030, /* GB18030 */
_PG_LAST_ENCODING_ /* mark only */
} pg_enc;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment