Update 'East Asian Width' table to comply with Unicode 13.0

Instead of manually maintaining the data in eaw_table.h, it is now automatically generated by ucgendat/ucgendat.php, using the EastAsianWidth.txt file from the Unicode Consortium.

Something must be said about the deleted test case (bug28220.phpt). Back in 2004, someone noticed that `mb_strwidth` didn't comply with Unicode 4.0, and a test case was added to expose the problem. Well, time keeps moving on, and with the changing years, new Unicode versions are born and old ones die. Some characters which were counted as double-width in Unicode 4.0 are no longer such in Unicode 13.0, which renders the test case obsolete.

At the same time, make a couple of spelling/grammar fixes in ucgendat.php.
author: Alex Dowad <alexinbeijing@gmail.com> 2020-09-24 10:40:49 +0200
committer: Alex Dowad <alexinbeijing@gmail.com> 2021-01-19 20:38:44 +0200
commit: d8c785b894e1a4ed9793d71cad02330cb0034faa (patch)
tree: b5a56a63f00e081770ffd0f5e48d85b6d310d649
parent: 28fa0b6365958030ad9697d926652ab74098d17f (diff)
download: php-git-d8c785b894e1a4ed9793d71cad02330cb0034faa.tar.gz
4 files changed, 189 insertions, 54 deletions
diff --git a/ext/mbstring/libmbfl/mbfl/eaw_table.h b/ext/mbstring/libmbfl/mbfl/eaw_table.h
index 4959454451..b7ec454d00 100644
--- a/ext/mbstring/libmbfl/mbfl/eaw_table.h
+++ b/ext/mbstring/libmbfl/mbfl/eaw_table.h
@@ -1,4 +1,8 @@
-/* East Asian Width table
+/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
+ *
+ *                     DO NOT EDIT THIS FILE!
+ *
+ * East Asian Width table
  *
  * Some characters in East Asian languages are intended to be displayed in a space
  * which is roughly square. (This contrasts with others such as the Latin alphabet,
@@ -7,16 +11,48 @@
  * when doing things like wrapping text to a specific width.
  *
  * Each pair of numbers in the below table is a range of Unicode codepoints
- * which should be displayed as double-width. */
+ * which should be displayed as double-width.
+ */
 
 static const struct {
 	int begin;
 	int end;
 } mbfl_eaw_table[] = {
 	{ 0x1100, 0x115f },
-	{ 0x11a3, 0x11a7 },
-	{ 0x11fa, 0x11ff },
+	{ 0x231a, 0x231b },
 	{ 0x2329, 0x232a },
+	{ 0x23e9, 0x23ec },
+	{ 0x23f0, 0x23f0 },
+	{ 0x23f3, 0x23f3 },
+	{ 0x25fd, 0x25fe },
+	{ 0x2614, 0x2615 },
+	{ 0x2648, 0x2653 },
+	{ 0x267f, 0x267f },
+	{ 0x2693, 0x2693 },
+	{ 0x26a1, 0x26a1 },
+	{ 0x26aa, 0x26ab },
+	{ 0x26bd, 0x26be },
+	{ 0x26c4, 0x26c5 },
+	{ 0x26ce, 0x26ce },
+	{ 0x26d4, 0x26d4 },
+	{ 0x26ea, 0x26ea },
+	{ 0x26f2, 0x26f3 },
+	{ 0x26f5, 0x26f5 },
+	{ 0x26fa, 0x26fa },
+	{ 0x26fd, 0x26fd },
+	{ 0x2705, 0x2705 },
+	{ 0x270a, 0x270b },
+	{ 0x2728, 0x2728 },
+	{ 0x274c, 0x274c },
+	{ 0x274e, 0x274e },
+	{ 0x2753, 0x2755 },
+	{ 0x2757, 0x2757 },
+	{ 0x2795, 0x2797 },
+	{ 0x27b0, 0x27b0 },
+	{ 0x27bf, 0x27bf },
+	{ 0x2b1b, 0x2b1c },
+	{ 0x2b50, 0x2b50 },
+	{ 0x2b55, 0x2b55 },
 	{ 0x2e80, 0x2e99 },
 	{ 0x2e9b, 0x2ef3 },
 	{ 0x2f00, 0x2fd5 },
@@ -24,20 +60,16 @@ static const struct {
 	{ 0x3000, 0x303e },
 	{ 0x3041, 0x3096 },
 	{ 0x3099, 0x30ff },
-	{ 0x3105, 0x312d },
+	{ 0x3105, 0x312f },
 	{ 0x3131, 0x318e },
-	{ 0x3190, 0x31ba },
-	{ 0x31c0, 0x31e3 },
+	{ 0x3190, 0x31e3 },
 	{ 0x31f0, 0x321e },
 	{ 0x3220, 0x3247 },
-	{ 0x3250, 0x32fe },
-	{ 0x3300, 0x4dbf },
+	{ 0x3250, 0x4dbf },
 	{ 0x4e00, 0xa48c },
 	{ 0xa490, 0xa4c6 },
 	{ 0xa960, 0xa97c },
 	{ 0xac00, 0xd7a3 },
-	{ 0xd7b0, 0xd7c6 },
-	{ 0xd7cb, 0xd7fb },
 	{ 0xf900, 0xfaff },
 	{ 0xfe10, 0xfe19 },
 	{ 0xfe30, 0xfe52 },
@@ -45,11 +77,61 @@ static const struct {
 	{ 0xfe68, 0xfe6b },
 	{ 0xff01, 0xff60 },
 	{ 0xffe0, 0xffe6 },
-	{ 0x1b000, 0x1b001 },
+	{ 0x16fe0, 0x16fe4 },
+	{ 0x16ff0, 0x16ff1 },
+	{ 0x17000, 0x187f7 },
+	{ 0x18800, 0x18cd5 },
+	{ 0x18d00, 0x18d08 },
+	{ 0x1b000, 0x1b11e },
+	{ 0x1b150, 0x1b152 },
+	{ 0x1b164, 0x1b167 },
+	{ 0x1b170, 0x1b2fb },
+	{ 0x1f004, 0x1f004 },
+	{ 0x1f0cf, 0x1f0cf },
+	{ 0x1f18e, 0x1f18e },
+	{ 0x1f191, 0x1f19a },
 	{ 0x1f200, 0x1f202 },
-	{ 0x1f210, 0x1f23a },
+	{ 0x1f210, 0x1f23b },
 	{ 0x1f240, 0x1f248 },
 	{ 0x1f250, 0x1f251 },
+	{ 0x1f260, 0x1f265 },
+	{ 0x1f300, 0x1f320 },
+	{ 0x1f32d, 0x1f335 },
+	{ 0x1f337, 0x1f37c },
+	{ 0x1f37e, 0x1f393 },
+	{ 0x1f3a0, 0x1f3ca },
+	{ 0x1f3cf, 0x1f3d3 },
+	{ 0x1f3e0, 0x1f3f0 },
+	{ 0x1f3f4, 0x1f3f4 },
+	{ 0x1f3f8, 0x1f43e },
+	{ 0x1f440, 0x1f440 },
+	{ 0x1f442, 0x1f4fc },
+	{ 0x1f4ff, 0x1f53d },
+	{ 0x1f54b, 0x1f54e },
+	{ 0x1f550, 0x1f567 },
+	{ 0x1f57a, 0x1f57a },
+	{ 0x1f595, 0x1f596 },
+	{ 0x1f5a4, 0x1f5a4 },
+	{ 0x1f5fb, 0x1f64f },
+	{ 0x1f680, 0x1f6c5 },
+	{ 0x1f6cc, 0x1f6cc },
+	{ 0x1f6d0, 0x1f6d2 },
+	{ 0x1f6d5, 0x1f6d7 },
+	{ 0x1f6eb, 0x1f6ec },
+	{ 0x1f6f4, 0x1f6fc },
+	{ 0x1f7e0, 0x1f7eb },
+	{ 0x1f90c, 0x1f93a },
+	{ 0x1f93c, 0x1f945 },
+	{ 0x1f947, 0x1f978 },
+	{ 0x1f97a, 0x1f9cb },
+	{ 0x1f9cd, 0x1f9ff },
+	{ 0x1fa70, 0x1fa74 },
+	{ 0x1fa78, 0x1fa7a },
+	{ 0x1fa80, 0x1fa86 },
+	{ 0x1fa90, 0x1faa8 },
+	{ 0x1fab0, 0x1fab6 },
+	{ 0x1fac0, 0x1fac2 },
+	{ 0x1fad0, 0x1fad6 },
 	{ 0x20000, 0x2fffd },
-	{ 0x30000, 0x3fffd }
+	{ 0x30000, 0x3fffd },
 };
diff --git a/ext/mbstring/tests/bug28220.phpt b/ext/mbstring/tests/bug28220.phpt
deleted file mode 100644
index 6845dc3ed9..0000000000
--- a/ext/mbstring/tests/bug28220.phpt
+++ /dev/null
@@ -1,25 +0,0 @@
---TEST--
-Bug #28220 (mb_strwidth() returns wrong width values for some Hangul characters)
---SKIPIF--
-<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
---FILE--
-<?php
-$coderange = array(
-    range(0x0000, 0x1fff),
-    range(0xff60, 0xff9f)
-);
-
-
-foreach ($coderange as $r) {
-    $ng = 0;
-    foreach ($r as $c) {
-        if (mb_strwidth(pack('N1', $c), 'UCS-4BE') != 2) {
-            $ng++;
-        }
-    }
-    echo "$ng\n";
-}
-?>
---EXPECT--
-8085
-63
diff --git a/ext/mbstring/ucgendat/ucgendat.php b/ext/mbstring/ucgendat/ucgendat.php
index 8901a587b3..d1e887589f 100755
--- a/ext/mbstring/ucgendat/ucgendat.php
+++ b/ext/mbstring/ucgendat/ucgendat.php
@@ -4,10 +4,10 @@
 /**
  * This is based on the ucgendat.c file from the OpenLDAP project, licensed as
  * follows. This file is not necessary to build PHP. It's only necessary to
- * rebuild unicode_data.h from Unicode ucd files.
+ * rebuild unicode_data.h and eaw_table.h from Unicode ucd files.
  *
  * Example usage:
- * php ucgendat.php UnicodeData.txt
+ * php ucgendat.php path/to/Unicode/data/files
  */
 
 /* Copyright 1998-2007 The OpenLDAP Foundation.
@@ -45,7 +45,7 @@
 if ($argc < 2) {
     echo "Usage: php ucgendata.php ./datadir\n";
     echo "./datadir must contain:\n";
-    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n";
+    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
     return;
 }
 
@@ -54,8 +54,9 @@ $unicodeDataFile = $dir . '/UnicodeData.txt';
 $caseFoldingFile = $dir . '/CaseFolding.txt';
 $specialCasingFile = $dir . '/SpecialCasing.txt';
 $derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
+$eastAsianWidthFile = $dir . '/EastAsianWidth.txt';
 
-$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile];
+$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
 foreach ($files as $file) {
     if (!file_exists($file)) {
         echo "File $file does not exist.\n";
@@ -72,6 +73,11 @@ parseSpecialCasing($data, file_get_contents($specialCasingFile));
 parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
 file_put_contents($outputFile, generateData($data));
 
+$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
+
+$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
+file_put_contents($eawFile, generateEastAsianWidthData($eawData));
+
 class Range {
     public $start;
     public $end;
@@ -372,6 +378,43 @@ function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
     }
 }
 
+function parseEastAsianWidth(string $input) : array {
+    $wideRanges = [];
+
+    foreach (parseDataFile($input) as $fields) {
+        if ($fields[1] == 'W' || $fields[1] == 'F') {
+            if ($dotsPos = strpos($fields[0], '..')) {
+                $startCode = intval(substr($fields[0], 0, $dotsPos), 16);
+                $endCode = intval(substr($fields[0], $dotsPos + 2), 16);
+
+                if (!empty($wideRanges)) {
+                    $lastRange = $wideRanges[count($wideRanges) - 1];
+                    if ($startCode == $lastRange->end + 1) {
+                        $lastRange->end = $endCode;
+                        continue;
+                    }
+                }
+
+                $wideRanges[] = new Range($startCode, $endCode);
+            } else {
+                $code = intval($fields[0], 16);
+
+                if (!empty($wideRanges)) {
+                    $lastRange = $wideRanges[count($wideRanges) - 1];
+                    if ($code == $lastRange->end + 1) {
+                        $lastRange->end++;
+                        continue;
+                    }
+                }
+
+                $wideRanges[] = new Range($code, $code);
+            }
+        }
+    }
+
+    return $wideRanges;
+}
+
 function formatArray(array $values, int $width, string $format) : string {
     $result = '';
     $i = 0;
@@ -412,7 +455,7 @@ function generatePropData(UnicodeData $data) {
     $propOffsets[] = $idx;
 
     // TODO ucgendat.c pads the prop offsets to the next multiple of 4
-    // for rather debious reasons of alignment. This should probably be
+    // for rather dubious reasons of alignment. This should probably be
     // dropped
     while (count($propOffsets) % 4 != 0) {
         $propOffsets[] = 0;
@@ -509,17 +552,17 @@ function generateCaseData(UnicodeData $data) {
 
 function generateData(UnicodeData $data) {
     $result = <<<'HEADER'
-/* This file was generated from a modified version UCData's ucgendat.
+/* This file was generated from a modified version of UCData's ucgendat.
  *
  *                     DO NOT EDIT THIS FILE!
  *
- * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
- * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
- * files from  http://www.unicode.org/Public/ and run this program.
+ * Instead, download the appropriate UnicodeData-x.x.x.txt and
+ * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
+ * and run ext/mbstring/ucgendat/ucgendat.php.
  *
  * More information can be found in the UCData package. Unfortunately,
  * the project's page doesn't seem to be live anymore, so you can use
- * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
+ * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
 HEADER;
     $result .= "\n\n" . generatePropData($data);
     $result .= generateCaseData($data);
@@ -646,3 +689,38 @@ function generateMPH(array $map, bool $fast) {
 
     return $mph;
 }
+
+function generateEastAsianWidthData(array $wideRanges) {
+      $result = <<<'HEADER'
+/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
+ *
+ *                     DO NOT EDIT THIS FILE!
+ *
+ * East Asian Width table
+ *
+ * Some characters in East Asian languages are intended to be displayed in a space
+ * which is roughly square. (This contrasts with others such as the Latin alphabet,
+ * which are taller than they are wide.) To display these East Asian characters
+ * properly, twice the horizontal space is used. This must be taken into account
+ * when doing things like wrapping text to a specific width.
+ *
+ * Each pair of numbers in the below table is a range of Unicode codepoints
+ * which should be displayed as double-width.
+ */
+
+static const struct {
+	int begin;
+	int end;
+} mbfl_eaw_table[] = {
+
+HEADER;
+
+    foreach ($wideRanges as $range) {
+        $startCode = dechex($range->start);
+        $endCode = dechex($range->end);
+        $result .= "\t{ 0x{$startCode}, 0x{$endCode} },\n";
+    }
+
+    $result .= "};\n";
+    return $result;
+}
diff --git a/ext/mbstring/unicode_data.h b/ext/mbstring/unicode_data.h
index d0bc23aa10..12ee3c7478 100644
--- a/ext/mbstring/unicode_data.h
+++ b/ext/mbstring/unicode_data.h
@@ -1,14 +1,14 @@
-/* This file was generated from a modified version UCData's ucgendat.
+/* This file was generated from a modified version of UCData's ucgendat.
  *
  *                     DO NOT EDIT THIS FILE!
  *
- * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
- * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
- * files from  http://www.unicode.org/Public/ and run this program.
+ * Instead, download the appropriate UnicodeData-x.x.x.txt and
+ * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
+ * and run ext/mbstring/ucgendat/ucgendat.php.
  *
  * More information can be found in the UCData package. Unfortunately,
  * the project's page doesn't seem to be live anymore, so you can use
- * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
+ * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
 
 static const unsigned short _ucprop_size = 44;
author	Alex Dowad <alexinbeijing@gmail.com>	2020-09-24 10:40:49 +0200
committer	Alex Dowad <alexinbeijing@gmail.com>	2021-01-19 20:38:44 +0200
commit	d8c785b894e1a4ed9793d71cad02330cb0034faa (patch)
tree	b5a56a63f00e081770ffd0f5e48d85b6d310d649
parent	28fa0b6365958030ad9697d926652ab74098d17f (diff)
download	php-git-d8c785b894e1a4ed9793d71cad02330cb0034faa.tar.gz