diff options
author | Rui Hirokawa <hirokawa@php.net> | 2001-09-15 04:48:48 +0000 |
---|---|---|
committer | Rui Hirokawa <hirokawa@php.net> | 2001-09-15 04:48:48 +0000 |
commit | 9c5580c7d41ffc3f261a02a2178c16fede235718 (patch) | |
tree | 815165e7a33269ff0fe3dc17a7ce6e0cf64c8833 | |
parent | 7c6b7baac9fedc1081eefb916abb61907fee73af (diff) | |
download | php-git-9c5580c7d41ffc3f261a02a2178c16fede235718.tar.gz |
Added support for Japanese encodings to htmlentities() and htmlspecialchars(). @ Added support for Japanese encodings to htmlentities() and htmlspecialchars(). (Rui)
-rw-r--r-- | ext/standard/html.c | 77 |
1 files changed, 76 insertions, 1 deletions
diff --git a/ext/standard/html.c b/ext/standard/html.c index 6a6c773140..092949c031 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -35,7 +35,8 @@ Defaults to ISO-8859-1 for now. */ enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252, - cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs }; + cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, + cs_big5hkscs, cs_sjis, cs_eucjp}; typedef const char * entity_table_t; /* codepage 1252 is a Windows extension to iso-8859-1. */ @@ -99,6 +100,8 @@ static const struct html_entity_map entity_map[] = { { cs_big5, 0xa0, 0xff, ent_iso_8859_1 }, { cs_gb2312, 0xa0, 0xff, ent_iso_8859_1 }, { cs_big5hkscs, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_sjis, 0xa0, 0xff, ent_iso_8859_1 }, + { cs_eucjp, 0xa0, 0xff, ent_iso_8859_1 }, { cs_terminator } }; @@ -113,6 +116,10 @@ static const struct { { "BIG5", cs_big5 }, { "GB2312", cs_gb2312 }, { "BIG5-HKSCS", cs_big5hkscs }, + { "Shift_JIS", cs_sjis }, + { "SJIS", cs_sjis }, + { "EUCJP", cs_eucjp }, + { "EUC-JP", cs_eucjp }, { NULL } }; @@ -236,6 +243,74 @@ inline static unsigned short get_next_char(enum entity_charset charset, } break; } + case cs_sjis: + { + /* check if this is the first of a 2-byte sequence */ + if ( (this_char >= 0x81 && this_char <= 0x9f) || + (this_char >= 0xe0 && this_char <= 0xef) + ) { + /* peek at the next char */ + unsigned char next_char = str[pos]; + if ((next_char >= 0x40 && next_char <= 0x7e) || + (next_char >= 0x80 && next_char <= 0xfc)) + { + /* yes, this a wide char */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + pos++; + } + + } + break; + } + case cs_eucjp: + { + /* check if this is the first of a multi-byte sequence */ + if (this_char >= 0xa1 && this_char <= 0xfe) { + /* peek at the next char */ + unsigned char next_char = str[pos]; + if (next_char >= 0xa1 && next_char <= 0xfe) + { + /* yes, this a jis kanji char */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + pos++; + } + + } else 
if (this_char == 0x8e) { + /* peek at the next char */ + unsigned char next_char = str[pos]; + if (next_char >= 0xa1 && next_char <= 0xdf) + { + /* JIS X 0201 kana */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + pos++; + } + + } else if (this_char == 0x8f) { + /* peek at the next two char */ + unsigned char next_char = str[pos]; + unsigned char next2_char = str[pos+1]; + if ((next_char >= 0xa1 && next_char <= 0xfe) && + (next2_char >= 0xa1 && next2_char <= 0xfe)) + { + /* JIS X 0212 hojo-kanji */ + this_char <<= 8; + mbseq[mbpos++] = next_char; + this_char |= next_char; + this_char <<= 8; + mbseq[mbpos++] = next2_char; + this_char |= next2_char; + pos+=2; + } + + } + break; + } } *newpos = pos; mbseq[mbpos] = '\0'; |