From afcde99b1bbb85ecffcc3d2dcdd414cc6f9866f8 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Wed, 18 Mar 2009 16:26:18 +0000
Subject: Fix case of the just resurrected UCS_to_BIG5.pl script, and update
 Makefile to use it.

---
 src/backend/utils/mb/Unicode/Makefile       |  12 +-
 src/backend/utils/mb/Unicode/UCS_to_BIG5.pl | 177 ++++++++++++++++++++++++++++
 src/backend/utils/mb/Unicode/UCS_to_big5.pl | 177 ----------------------------
 3 files changed, 185 insertions(+), 181 deletions(-)
 create mode 100755 src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
 delete mode 100644 src/backend/utils/mb/Unicode/UCS_to_big5.pl

(limited to 'src')

diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index da605a3d49..c0eae653a9 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -4,7 +4,7 @@
 #
 # Copyright (c) 2001-2009, PostgreSQL Global Development Group
 #
-# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/Makefile,v 1.15 2009/01/01 17:23:51 momjian Exp $
+# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/Makefile,v 1.16 2009/03/18 16:26:18 heikki Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -39,7 +39,6 @@ WINMAPS = win866_to_utf8.map utf8_to_win866.map \
 	win1258_to_utf8.map utf8_to_win1258.map
 
 GENERICMAPS = $(ISO8859MAPS) $(WINMAPS) \
-	big5_to_utf8.map utf8_to_big5.map \
 	johab_to_utf8.map utf8_to_johab.map \
 	uhc_to_utf8.map utf8_to_uhc.map \
 	gbk_to_utf8.map utf8_to_gbk.map \
@@ -50,7 +49,8 @@ SPECIALMAPS = euc_cn_to_utf8.map utf8_to_euc_cn.map \
 	euc_kr_to_utf8.map utf8_to_euc_kr.map \
 	euc_tw_to_utf8.map utf8_to_euc_tw.map \
 	sjis_to_utf8.map utf8_to_sjis.map \
-	gb18030_to_utf8.map utf8_to_gb18030.map
+	gb18030_to_utf8.map utf8_to_gb18030.map \
+	big5_to_utf8.map utf8_to_big5.map
 
 MAPS = $(GENERICMAPS) $(SPECIALMAPS)
 
@@ -64,7 +64,7 @@ WINTEXTS = CP866.TXT CP874.TXT CP1250.TXT CP1251.TXT \
 	CP1256.TXT CP1257.TXT CP1258.TXT
 
 GENERICTEXTS = $(ISO8859TEXTS) $(WINTEXTS) \
-	KOI8-R.TXT CP936.TXT CP949.TXT JOHAB.TXT BIG5.TXT
+	KOI8-R.TXT CP936.TXT CP949.TXT JOHAB.TXT
 
 all: $(MAPS)
 
@@ -88,6 +88,10 @@ sjis_to_utf8.map utf8_to_sjis.map : CP932.TXT
 gb18030_to_utf8.map utf8_to_gb18030.map : ISO10646-GB18030.TXT
 	$(PERL) $(srcdir)/UCS_to_GB18030.pl
 
+
+big5_to_utf8.map utf8_to_big5.map : BIG5.TXT CP950.TXT
+	$(PERL) $(srcdir)/UCS_to_BIG5.pl
+
 clean:
 	rm -f $(MAPS)
 
diff --git a/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
new file mode 100755
index 0000000000..8b9612f118
--- /dev/null
+++ b/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
@@ -0,0 +1,177 @@
+#! /usr/bin/perl
+#
+# Copyright (c) 2001-2009, PostgreSQL Global Development Group
+#
+# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/UCS_to_BIG5.pl,v 1.9 2009/03/18 16:26:18 heikki Exp $
+#
+# Generate UTF-8 <--> BIG5 conversion tables from
+# map files provided by Unicode organization.
+# Unfortunately it is prohibited by the organization
+# to distribute the map files. So if you try to use this script,
+# you have to obtain the map files from the organization's ftp site.
+# ftp://www.unicode.org/Public/MAPPINGS/
+#
+# Our "big5" comes from BIG5.TXT, with the addition of the characters
+# in the range 0xf9d6-0xf9dc from CP950.TXT.
+#
+# BIG5.TXT format:
+#		 BIG5 code in hex
+#		 UCS-2 code in hex
+#		 # and Unicode name (not used in this script)
+#
+# CP950.TXT format:
+#		 CP950 code in hex
+#		 UCS-2 code in hex
+#		 # and Unicode name (not used in this script)
+
+
+require "ucs2utf.pl";
+
+
+#
+# first, generate UTF8 --> BIG5 table
+#
+$in_file = "BIG5.TXT";
+
+open( FILE, $in_file ) || die( "cannot open $in_file" );
+
+reset 'array';
+
+while( <FILE> ){
+	chop;
+	if( /^#/ ){
+		next;
+	}
+	( $c, $u, $rest ) = split;
+	$ucs = hex($u);
+	$code = hex($c);
+	if( $code >= 0x80 && $ucs >= 0x0080){
+		$utf = &ucs2utf($ucs);
+		if( $array{ $utf } ne "" ){
+			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
+			next;
+		}
+		$count++;
+		$array{ $utf } = $code;
+	}
+}
+close( FILE );
+
+$in_file = "CP950.TXT";
+
+open( FILE, $in_file ) || die( "cannot open $in_file" );
+
+while( <FILE> ){
+	chop;
+	if( /^#/ ){
+		next;
+	}
+	( $c, $u, $rest ) = split;
+	$ucs = hex($u);
+	$code = hex($c);
+
+	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
+	# from CP950.TXT
+	if( $code >= 0x80 && $ucs >= 0x0080 &&
+		$code >= 0xf9d6 && $code <= 0xf9dc ){
+		$utf = &ucs2utf($ucs);
+		if( $array{ $utf } ne "" ){
+			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
+			next;
+		}
+		$count++;
+		$array{ $utf } = $code;
+	}
+}
+close( FILE );
+
+$file = lc("utf8_to_big5.map");
+open( FILE, "> $file" ) || die( "cannot open $file" );
+print FILE "static pg_utf_to_local ULmapBIG5[ $count ] = {\n";
+
+for $index ( sort {$a <=> $b} keys( %array ) ){
+	$code = $array{ $index };
+	$count--;
+	if( $count == 0 ){
+		printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
+	} else {
+		printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
+	}
+}
+
+print FILE "};\n";
+close(FILE);
+
+#
+# then generate BIG5 --> UTF8 table
+#
+$in_file = "BIG5.TXT";
+
+open( FILE, $in_file ) || die( "cannot open $in_file" );
+
+reset 'array';
+
+while( <FILE> ){
+	chop;
+	if( /^#/ ){
+		next;
+	}
+	( $c, $u, $rest ) = split;
+	$ucs = hex($u);
+	$code = hex($c);
+	if( $code >= 0x80 && $ucs >= 0x0080){
+		$utf = &ucs2utf($ucs);
+		if( $array{ $utf } ne "" ){
+			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
+			next;
+		}
+		$count++;
+		$array{ $code } = $utf;
+	}
+}
+close( FILE );
+
+$in_file = "CP950.TXT";
+
+open( FILE, $in_file ) || die( "cannot open $in_file" );
+
+while( <FILE> ){
+	chop;
+	if( /^#/ ){
+		next;
+	}
+	( $c, $u, $rest ) = split;
+	$ucs = hex($u);
+	$code = hex($c);
+
+	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
+	# from CP950.TXT
+	if( $code >= 0x80 && $ucs >= 0x0080 &&
+		$code >= 0xf9d6 && $code <= 0xf9dc ){
+		$utf = &ucs2utf($ucs);
+		if( $array{ $utf } ne "" ){
+			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
+			next;
+		}
+		$count++;
+		$array{ $code } = $utf;
+	}
+}
+close( FILE );
+
+$file = lc("big5_to_utf8.map");
+open( FILE, "> $file" ) || die( "cannot open $file" );
+print FILE "static pg_local_to_utf LUmapBIG5[ $count ] = {\n";
+for $index ( sort {$a <=> $b} keys( %array ) ){
+	$utf = $array{ $index };
+	$count--;
+	if( $count == 0 ){
+		printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
+	} else {
+		printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
+	}
+}
+
+print FILE "};\n";
+close(FILE);
+
diff --git a/src/backend/utils/mb/Unicode/UCS_to_big5.pl b/src/backend/utils/mb/Unicode/UCS_to_big5.pl
deleted file mode 100644
index 58886d67f1..0000000000
--- a/src/backend/utils/mb/Unicode/UCS_to_big5.pl
+++ /dev/null
@@ -1,177 +0,0 @@
-#! /usr/bin/perl
-#
-# Copyright (c) 2001-2009, PostgreSQL Global Development Group
-#
-# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/UCS_to_big5.pl,v 1.1 2009/03/18 16:17:58 heikki Exp $
-#
-# Generate UTF-8 <--> BIG5 conversion tables from
-# map files provided by Unicode organization.
-# Unfortunately it is prohibited by the organization
-# to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
-#
-# Our "big5" comes from BIG5.TXT, with the addition of the characters
-# in the range 0xf9d6-0xf9dc from CP950.TXT.
-#
-# BIG5.TXT format:
-#		 BIG5 code in hex
-#		 UCS-2 code in hex
-#		 # and Unicode name (not used in this script)
-#
-# CP950.TXT format:
-#		 CP950 code in hex
-#		 UCS-2 code in hex
-#		 # and Unicode name (not used in this script)
-
-
-require "ucs2utf.pl";
-
-
-#
-# first, generate UTF8 --> BIG5 table
-#
-$in_file = "BIG5.TXT";
-
-open( FILE, $in_file ) || die( "cannot open $in_file" );
-
-reset 'array';
-
-while( <FILE> ){
-	chop;
-	if( /^#/ ){
-		next;
-	}
-	( $c, $u, $rest ) = split;
-	$ucs = hex($u);
-	$code = hex($c);
-	if( $code >= 0x80 && $ucs >= 0x0080){
-		$utf = &ucs2utf($ucs);
-		if( $array{ $utf } ne "" ){
-			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
-			next;
-		}
-		$count++;
-		$array{ $utf } = $code;
-	}
-}
-close( FILE );
-
-$in_file = "CP950.TXT";
-
-open( FILE, $in_file ) || die( "cannot open $in_file" );
-
-while( <FILE> ){
-	chop;
-	if( /^#/ ){
-		next;
-	}
-	( $c, $u, $rest ) = split;
-	$ucs = hex($u);
-	$code = hex($c);
-
-	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
-	# from CP950.TXT
-	if( $code >= 0x80 && $ucs >= 0x0080 &&
-		$code >= 0xf9d6 && $code <= 0xf9dc ){
-		$utf = &ucs2utf($ucs);
-		if( $array{ $utf } ne "" ){
-			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
-			next;
-		}
-		$count++;
-		$array{ $utf } = $code;
-	}
-}
-close( FILE );
-
-$file = lc("utf8_to_big5.map");
-open( FILE, "> $file" ) || die( "cannot open $file" );
-print FILE "static pg_utf_to_local ULmapBIG5[ $count ] = {\n";
-
-for $index ( sort {$a <=> $b} keys( %array ) ){
-	$code = $array{ $index };
-	$count--;
-	if( $count == 0 ){
-		printf FILE " {0x%04x, 0x%04x}\n", $index, $code;
-	} else {
-		printf FILE " {0x%04x, 0x%04x},\n", $index, $code;
-	}
-}
-
-print FILE "};\n";
-close(FILE);
-
-#
-# then generate BIG5 --> UTF8 table
-#
-$in_file = "BIG5.TXT";
-
-open( FILE, $in_file ) || die( "cannot open $in_file" );
-
-reset 'array';
-
-while( <FILE> ){
-	chop;
-	if( /^#/ ){
-		next;
-	}
-	( $c, $u, $rest ) = split;
-	$ucs = hex($u);
-	$code = hex($c);
-	if( $code >= 0x80 && $ucs >= 0x0080){
-		$utf = &ucs2utf($ucs);
-		if( $array{ $utf } ne "" ){
-			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
-			next;
-		}
-		$count++;
-		$array{ $code } = $utf;
-	}
-}
-close( FILE );
-
-$in_file = "CP950.TXT";
-
-open( FILE, $in_file ) || die( "cannot open $in_file" );
-
-while( <FILE> ){
-	chop;
-	if( /^#/ ){
-		next;
-	}
-	( $c, $u, $rest ) = split;
-	$ucs = hex($u);
-	$code = hex($c);
-
-	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
-	# from CP950.TXT
-	if( $code >= 0x80 && $ucs >= 0x0080 &&
-		$code >= 0xf9d6 && $code <= 0xf9dc ){
-		$utf = &ucs2utf($ucs);
-		if( $array{ $utf } ne "" ){
-			printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
-			next;
-		}
-		$count++;
-		$array{ $code } = $utf;
-	}
-}
-close( FILE );
-
-$file = lc("big5_to_utf8.map");
-open( FILE, "> $file" ) || die( "cannot open $file" );
-print FILE "static pg_local_to_utf LUmapBIG5[ $count ] = {\n";
-for $index ( sort {$a <=> $b} keys( %array ) ){
-	$utf = $array{ $index };
-	$count--;
-	if( $count == 0 ){
-		printf FILE " {0x%04x, 0x%04x}\n", $index, $utf;
-	} else {
-		printf FILE " {0x%04x, 0x%04x},\n", $index, $utf;
-	}
-}
-
-print FILE "};\n";
-close(FILE);
-
-- 
cgit v1.2.1