summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alex Dowad <alexinbeijing@gmail.com> 2020-11-14 23:43:28 +0200
committer Alex Dowad <alexinbeijing@gmail.com> 2020-11-25 20:51:45 +0200
commit c9fea7db728ae0a22bf6f903fcbbea9468f222a4 (patch)
tree 82f9bb7aa97df285ed146679fb4ac775f776fab4
parent ecf718470b78c2fddc9d07cf1215a27773264515 (diff)
download php-git-c9fea7db728ae0a22bf6f903fcbbea9468f222a4.tar.gz
Convert U+00AF (MACRON) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants
The exception is vanilla Shift-JIS, where 0x7E is a halfwidth overline/macron, so U+00AF is converted to 0x7E there instead. As for Shift-JIS-2004, it has an added character (byte sequence 0x854A) which was defined as a halfwidth macron in JIS X 0213:2000, so we use that.
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c 4
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_sjis.c 2
-rw-r--r-- ext/mbstring/libmbfl/filters/unicode_table_jis.h 2
-rw-r--r-- ext/mbstring/tests/cp51932_encoding.phpt 2
-rw-r--r-- ext/mbstring/tests/cp932_encoding.phpt 3
-rw-r--r-- ext/mbstring/tests/sjis_encoding.phpt 2
-rw-r--r-- ext/mbstring/tests/sjismac_encoding.phpt 2
7 files changed, 14 insertions, 3 deletions
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c
index 507a26a5b1..25ce6c92bc 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c
@@ -194,7 +194,9 @@ mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter)
{
int s = 0;
- if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
+ if (c == 0xAF) { /* U+00AF is MACRON */
+ s = 0xA2B4; /* Use JIS X 0212 overline */
+ } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c
index 455c49cb9a..bde382a6d3 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c
@@ -211,7 +211,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
/* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or
* macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */
s1 = 0x2141;
- } else if (c == 0x203E) { /* U+203E is OVERLINE */
+ } else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
s1 = 0x7E; /* Halfwidth overline/macron */
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
diff --git a/ext/mbstring/libmbfl/filters/unicode_table_jis.h b/ext/mbstring/libmbfl/filters/unicode_table_jis.h
index 640c5587d8..04e6a63b9e 100644
--- a/ext/mbstring/libmbfl/filters/unicode_table_jis.h
+++ b/ext/mbstring/libmbfl/filters/unicode_table_jis.h
@@ -2303,7 +2303,7 @@ const unsigned short ucs_a1_jis_table[] = {
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
0x0000,0xA2C2,0x2171,0x2172,0xA2F0,0x0000,0xA2C3,0x2178,
- 0x212F,0xA2ED,0xA2EC,0x0000,0x224C,0x0000,0xA2EE,0xA2B4,
+ 0x212F,0xA2ED,0xA2EC,0x0000,0x224C,0x0000,0xA2EE,0x2131,
0x216B,0x215E,0x0000,0x0000,0x212D,0x0000,0x2279,0x0000,
0xA2B1,0x0000,0xA2EB,0x0000,0x0000,0x0000,0x0000,0xA2C4,
0xAAA2,0xAAA1,0xAAA4,0xAAAA,0xAAA3,0xAAA9,0xA9A1,0xAAAE,
diff --git a/ext/mbstring/tests/cp51932_encoding.phpt b/ext/mbstring/tests/cp51932_encoding.phpt
index 5bf9ffe42e..bf7b60bcc3 100644
--- a/ext/mbstring/tests/cp51932_encoding.phpt
+++ b/ext/mbstring/tests/cp51932_encoding.phpt
@@ -88,6 +88,8 @@ for ($i = 0; $i <= 0x7F; $i++)
$fromUnicode["\x00\xA5"] = "\xA1\xEF";
/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
$fromUnicode["\x20\x3E"] = "\xA1\xB1";
+/* U+00AF is MACRON; convert to FULLWIDTH MACRON */
+$fromUnicode["\x00\xAF"] = "\xA1\xB1";
testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt
index 25844774de..b426281f24 100644
--- a/ext/mbstring/tests/cp932_encoding.phpt
+++ b/ext/mbstring/tests/cp932_encoding.phpt
@@ -50,6 +50,9 @@ $fromUnicode["\x00\xAC"] = "\x81\xCA";
/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */
$fromUnicode["\x20\x3E"] = "\x81\x50";
+/* U+00AF is MACRON; it can also go to FULLWIDTH MACRON */
+$fromUnicode["\x00\xAF"] = "\x81\x50";
+
findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
diff --git a/ext/mbstring/tests/sjis_encoding.phpt b/ext/mbstring/tests/sjis_encoding.phpt
index 8ac3b0563e..d7d7d26457 100644
--- a/ext/mbstring/tests/sjis_encoding.phpt
+++ b/ext/mbstring/tests/sjis_encoding.phpt
@@ -24,6 +24,8 @@ $fromUnicode["\x00\x7E"] = "\x81\x60";
/* DEL character */
$validChars["\x7F"] = "\x00\x7F";
$fromUnicode["\x00\x7F"] = "\x7F";
+/* U+00AF is MACRON; Shift-JIS 0x7E is overline */
+$fromUnicode["\x00\xAF"] = "\x7E";
/* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
$validChars["\x81\x5F"] = "\xFF\x3C";
$fromUnicode["\xFF\x3C"] = "\x81\x5F";
diff --git a/ext/mbstring/tests/sjismac_encoding.phpt b/ext/mbstring/tests/sjismac_encoding.phpt
index 3c36484f4a..5803f0dc02 100644
--- a/ext/mbstring/tests/sjismac_encoding.phpt
+++ b/ext/mbstring/tests/sjismac_encoding.phpt
@@ -64,6 +64,8 @@ $fromUnicode["\x20\x15"] = "\x81\x5C";
/* Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) */
$fromUnicode["\x20\x3E"] = "\x81\x50";
+/* And also U+00AF (MACRON) */
+$fromUnicode["\x00\xAF"] = "\x81\x50";
/* Convert U+FF5E (FULLWIDTH TILDE) to 0x8160 (WAVE DASH) */
$fromUnicode["\xFF\x5E"] = "\x81\x60";