summary | refs | log | tree | commit | diff
path: root/ext/mbstring
diff options
context:
space:
mode:
author: Alex Dowad <alexinbeijing@gmail.com> 2020-11-14 23:03:03 +0200
committer: Alex Dowad <alexinbeijing@gmail.com> 2020-11-25 20:51:45 +0200
commit: 4f3bd2e235feffbdba7f4bb7e99502832eda5f5b (patch)
tree: e71a50fb6e271237fe981191dcb328aaa89cec51 /ext/mbstring
parent: 0d0029d729259f7977217fb94930f6fe973b1192 (diff)
download: php-git-4f3bd2e235feffbdba7f4bb7e99502832eda5f5b.tar.gz
Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants
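Converting U+203E to 0x7E was especially wrong for CP932, where 0x7E represents a tilde rather than an overline. For vanilla Shift-JIS and Shift-JIS-2004, converting to 0x7E is acceptable, since 0x7E does represent an overline/macron in those encodings; vanilla Shift-JIS therefore keeps that mapping explicitly. In the other variants, U+203E is now converted to JIS X 0208 FULLWIDTH MACRON (kuten 0x2131, i.e. 0x8150 in Shift-JIS form). Follow the same principle in CP51932, which is closely related to CP932: there U+203E becomes 0xA1B1, the EUC form of the same FULLWIDTH MACRON.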
Diffstat (limited to 'ext/mbstring')
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_cp51932.c | 2
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_cp932.c | 2
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_sjis.c | 4
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c | 2
-rw-r--r-- ext/mbstring/libmbfl/filters/unicode_table_jis.h | 2
-rw-r--r-- ext/mbstring/tests/cp51932_encoding.phpt | 2
-rw-r--r-- ext/mbstring/tests/cp932_encoding.phpt | 3
-rw-r--r-- ext/mbstring/tests/eucjp_encoding.phpt | 3
-rw-r--r-- ext/mbstring/tests/sjismac_encoding.phpt | 3
9 files changed, 14 insertions, 9 deletions
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c
index aa52d05481..0be771c3b8 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c
@@ -216,8 +216,6 @@ mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter)
if (s1 <= 0) {
if (c == 0xa5) { /* YEN SIGN */
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
- } else if (c == 0x203e) { /* OVER LINE */
- s1 = 0x007e; /* FULLWIDTH MACRON */
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
s1 = 0x2140;
} else if (c == 0xff5e) { /* FULLWIDTH TILDE */
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c
index ec192faa2b..120c5e626d 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c
@@ -253,8 +253,6 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter)
if (s1 <= 0) {
if (c == 0xa5) { /* YEN SIGN */
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
- } else if (c == 0x203e) { /* OVER LINE */
- s1 = 0x007e; /* FULLWIDTH MACRON */
} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
s1 = 0x2140;
} else if (c == 0xff5e) { /* FULLWIDTH TILDE */
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c
index 36f374a952..455c49cb9a 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c
@@ -211,6 +211,8 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
/* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or
* macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */
s1 = 0x2141;
+ } else if (c == 0x203E) { /* U+203E is OVERLINE */
+ s1 = 0x7E; /* Halfwidth overline/macron */
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
@@ -223,8 +225,6 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
if (s1 <= 0) {
if (c == 0xA5) { /* YEN SIGN */
s1 = 0x5C;
- } else if (c == 0x203E) { /* OVER LINE */
- s1 = 0x7E;
} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
s1 = 0x2140;
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c
index 7f0ff31aff..bad0423503 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c
@@ -749,8 +749,6 @@ int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter)
if (s1 <= 0) {
if (c == 0xA5) { /* YEN SIGN */
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
- } else if (c == 0x203E) { /* OVER LINE */
- s1 = 0x2131; /* FULLWIDTH MACRON */
} else if (c == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */
s1 = 0x2140;
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */
diff --git a/ext/mbstring/libmbfl/filters/unicode_table_jis.h b/ext/mbstring/libmbfl/filters/unicode_table_jis.h
index 3236578f9b..450428c578 100644
--- a/ext/mbstring/libmbfl/filters/unicode_table_jis.h
+++ b/ext/mbstring/libmbfl/filters/unicode_table_jis.h
@@ -2444,7 +2444,7 @@ const unsigned short ucs_a2_jis_table[] = {
0x2277,0x2278,0x0000,0x0000,0x0000,0x2145,0x2144,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x2273,0x0000,0x216C,0x216D,0x0000,0x0000,0x0000,0x0000,
- 0x0000,0x0000,0x0000,0x2228,0x0000,0x0000,0x0000,0x0000,
+ 0x0000,0x0000,0x0000,0x2228,0x0000,0x0000,0x2131,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
diff --git a/ext/mbstring/tests/cp51932_encoding.phpt b/ext/mbstring/tests/cp51932_encoding.phpt
index 2fc25fd3c8..5bf9ffe42e 100644
--- a/ext/mbstring/tests/cp51932_encoding.phpt
+++ b/ext/mbstring/tests/cp51932_encoding.phpt
@@ -86,6 +86,8 @@ for ($i = 0; $i <= 0x7F; $i++)
/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
$fromUnicode["\x00\xA5"] = "\xA1\xEF";
+/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
+$fromUnicode["\x20\x3E"] = "\xA1\xB1";
testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt
index ec9e76f1f6..25844774de 100644
--- a/ext/mbstring/tests/cp932_encoding.phpt
+++ b/ext/mbstring/tests/cp932_encoding.phpt
@@ -47,6 +47,9 @@ $fromUnicode["\x20\x16"] = "\x81\x61";
* but when converting Unicode to CP932, we also accept U+00AC (NOT SIGN) */
$fromUnicode["\x00\xAC"] = "\x81\xCA";
+/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */
+$fromUnicode["\x20\x3E"] = "\x81\x50";
+
findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
diff --git a/ext/mbstring/tests/eucjp_encoding.phpt b/ext/mbstring/tests/eucjp_encoding.phpt
index dc321d3bae..3a90431e4a 100644
--- a/ext/mbstring/tests/eucjp_encoding.phpt
+++ b/ext/mbstring/tests/eucjp_encoding.phpt
@@ -43,6 +43,9 @@ $fromUnicode["\x00\x00\x00\x7E"] = "\x7E";
/* Likewise with 0x005C */
$fromUnicode["\x00\x00\x00\x5C"] = "\x5C";
+/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
+$fromUnicode["\x00\x00\x20\x3E"] = "\xA1\xB1";
+
findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA1, 0xFE), 2) + array(0x8E => 2, 0x8F => 3));
/* In the JIS X 0212 character set, kuten code 0x2237 (EUC-JP 0x8FA2B7)
diff --git a/ext/mbstring/tests/sjismac_encoding.phpt b/ext/mbstring/tests/sjismac_encoding.phpt
index 2dedfa7970..e8b09d266f 100644
--- a/ext/mbstring/tests/sjismac_encoding.phpt
+++ b/ext/mbstring/tests/sjismac_encoding.phpt
@@ -62,6 +62,9 @@ $fromUnicode["\x00\x7F"] = "\x7F";
* and U+2015 */
$fromUnicode["\x20\x15"] = "\x81\x5C";
+/* Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) */
+$fromUnicode["\x20\x3E"] = "\x81\x50";
+
testAllValidChars($validChars, 'SJIS-mac', 'UTF-32BE');
echo "MacJapanese verification and conversion works on all valid characters\n";