summaryrefslogtreecommitdiff
path: root/ext/standard/html_tables/html_table_gen.php
diff options
context:
space:
mode:
Diffstat (limited to 'ext/standard/html_tables/html_table_gen.php')
-rw-r--r--ext/standard/html_tables/html_table_gen.php812
1 files changed, 812 insertions, 0 deletions
diff --git a/ext/standard/html_tables/html_table_gen.php b/ext/standard/html_tables/html_table_gen.php
new file mode 100644
index 0000000..7e7314f
--- /dev/null
+++ b/ext/standard/html_tables/html_table_gen.php
@@ -0,0 +1,812 @@
+<?php
+/*
+ +----------------------------------------------------------------------+
+ | PHP Version 5 |
+ +----------------------------------------------------------------------+
+ | Copyright (c) 1997-2010 The PHP Group |
+ +----------------------------------------------------------------------+
+ | This source file is subject to version 3.01 of the PHP license, |
+ | that is bundled with this package in the file LICENSE, and is |
+ | available through the world-wide-web at the following url: |
+ | http://www.php.net/license/3_01.txt |
+ | If you did not receive a copy of the PHP license and are unable to |
+ | obtain it through the world-wide-web, please send a note to |
+ | license@php.net so we can mail you a copy immediately. |
+ +----------------------------------------------------------------------+
+ | Authors: Gustavo Lopes <cataphract@php.net> |
+ +----------------------------------------------------------------------+
+*/
+
+/* This file prints to stdout the contents of ext/standard/html_tables.h */
+/* put together with glue; have patience */
+
+$t = <<<CODE
+/*
+ +----------------------------------------------------------------------+
+ | PHP Version 5 |
+ +----------------------------------------------------------------------+
+ | Copyright (c) 1997-%s The PHP Group |
+ +----------------------------------------------------------------------+
+ | This source file is subject to version 3.01 of the PHP license, |
+ | that is bundled with this package in the file LICENSE, and is |
+ | available through the world-wide-web at the following url: |
+ | http://www.php.net/license/3_01.txt |
+ | If you did not receive a copy of the PHP license and are unable to |
+ | obtain it through the world-wide-web, please send a note to |
+ | license@php.net so we can mail you a copy immediately. |
+ +----------------------------------------------------------------------+
+*/
+
+/* \$Id$ */
+
+#ifndef HTML_TABLES_H
+#define HTML_TABLES_H
+
+/**************************************************************************
+***************************************************************************
+** THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT. **
+***************************************************************************
+** Please change html_tables/html_table_gen.php instead and then **
+** run it in order to generate this file **
+***************************************************************************
+**************************************************************************/
+
+enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
+ cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
+ cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
+ cs_numelems /* used to count the number of charsets */
+ };
+#define CHARSET_UNICODE_COMPAT(cs) ((cs) <= cs_8859_1)
+#define CHARSET_SINGLE_BYTE(cs) ((cs) > cs_utf_8 && (cs) < cs_big5)
+#define CHARSET_PARTIAL_SUPPORT(cs) ((cs) >= cs_big5)
+
+static const struct {
+ const char *codeset;
+ enum entity_charset charset;
+} charset_map[] = {
+ { "ISO-8859-1", cs_8859_1 },
+ { "ISO8859-1", cs_8859_1 },
+ { "ISO-8859-15", cs_8859_15 },
+ { "ISO8859-15", cs_8859_15 },
+ { "utf-8", cs_utf_8 },
+ { "cp1252", cs_cp1252 },
+ { "Windows-1252", cs_cp1252 },
+ { "1252", cs_cp1252 },
+ { "BIG5", cs_big5 },
+ { "950", cs_big5 },
+ { "GB2312", cs_gb2312 },
+ { "936", cs_gb2312 },
+ { "BIG5-HKSCS", cs_big5hkscs },
+ { "Shift_JIS", cs_sjis },
+ { "SJIS", cs_sjis },
+ { "932", cs_sjis },
+ { "EUCJP", cs_eucjp },
+ { "EUC-JP", cs_eucjp },
+ { "KOI8-R", cs_koi8r },
+ { "koi8-ru", cs_koi8r },
+ { "koi8r", cs_koi8r },
+ { "cp1251", cs_cp1251 },
+ { "Windows-1251", cs_cp1251 },
+ { "win-1251", cs_cp1251 },
+ { "iso8859-5", cs_8859_5 },
+ { "iso-8859-5", cs_8859_5 },
+ { "cp866", cs_cp866 },
+ { "866", cs_cp866 },
+ { "ibm866", cs_cp866 },
+ { "MacRoman", cs_macroman },
+ { NULL }
+};
+
+/* longest entity name length excluding & and ; */
+#define LONGEST_ENTITY_LENGTH 31
+
+/* Definitions for mappings *to* Unicode.
+ * The origin charset must have at most 256 code points.
+ * The multi-byte encodings are not supported */
+typedef struct {
+ unsigned short uni_cp[64];
+} enc_to_uni_stage2;
+
+typedef struct {
+ const enc_to_uni_stage2 *inner[4];
+} enc_to_uni;
+
+/* bits 7-8 bits (only single bytes encodings supported )*/
+#define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
+/* bits 1-6 */
+#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)
+
+
+CODE;
+
+echo sprintf($t, date("Y"));
+
+$encodings = array(
+ array(
+ "ident" => "iso88591",
+ "enumid" => 1,
+ "name" => "ISO-8859-1",
+ "file" => "mappings/8859-1.TXT",
+ ),
+ array(
+ "ident" => "iso88595",
+ "enumid" => 5,
+ "name" => "ISO-8859-5",
+ "file" => "mappings/8859-5.TXT",
+ ),
+ array(
+ "ident" => "iso885915",
+ "enumid" => 3,
+ "name" => "ISO-8859-15",
+ "file" => "mappings/8859-15.TXT",
+ ),
+ array(
+ "ident" => "win1252",
+ "enumid" => 2,
+ "enumident" => "cp1252",
+ "name" => "Windows-1252",
+ "file" => "mappings/CP1252.TXT",
+ ),
+ array(
+ "ident" => "win1251",
+ "enumid" => 4,
+ "enumident" => "cp1252",
+ "name" => "Windows-1251",
+ "file" => "mappings/CP1251.TXT",
+ ),
+ array(
+ "ident" => "koi8r",
+ "enumid" => 8,
+ "name" => "KOI8-R",
+ "file" => "mappings/KOI8-R.TXT",
+ ),
+ array(
+ "ident" => "cp866",
+ "enumid" => 6,
+ "name" => "CP-866",
+ "file" => "mappings/CP866.TXT",
+ ),
+ array(
+ "ident" => "macroman",
+ "enumid" => 7,
+ "name" => "MacRoman",
+ "file" => "mappings/ROMAN.TXT",
+ ),
+);
+
+$prevStage2 = array();
+
+foreach ($encodings as $e) {
+ echo
+"/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";
+
+ /* process file */
+ $map = array();
+ $lines = explode("\n", file_get_contents($e{'file'}));
+ foreach ($lines as $l) {
+ if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
+ $map[] = array($matches[1], $matches[2]);
+ }
+
+ $mappy = array();
+ foreach ($map as $v) { $mappy[hexdec($v[0])] = hexdec($v[1]); }
+
+ $mstable = array("ident" => $e['ident']);
+ /* calculate two-stage tables */
+ for ($i = 0; $i < 4; $i++) {
+ for ($j = 0; $j < 64; $j++) {
+ $cp = $i << 6 | $j;
+ $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
+ }
+ }
+
+ echo
+"/* {{{ Stage 2 tables for {$e['name']} */\n\n";
+
+ $s2tables_idents = array();
+ for ($i = 0; $i < 4; $i++) {
+ if (($t = array_keys($prevStage2, $mstable[$i])) !== array()) {
+ $s2tables_idents[$i] = $encodings[$t[0]/5]["ident"];
+ continue;
+ }
+
+ $s2tables_idents[$i] = $e["ident"];
+
+ echo "static const enc_to_uni_stage2 enc_to_uni_s2_{$e['ident']}_".
+ sprintf("%02X", $i << 6)." = { {\n";
+ for ($j = 0; $j < 64; $j++) {
+ if ($j == 0) echo "\t";
+ elseif ($j % 6 == 0) echo "\n\t";
+ else echo " ";
+ if ($mstable[$i][$j] !== NULL)
+ echo sprintf("0x%04X,", $mstable[$i][$j]);
+ else
+ echo "0xFFFF,"; /* special value; indicates no mapping */
+ }
+ echo "\n} };\n\n";
+
+ $prevStage2[] = $mstable[$i];
+ }
+
+ echo
+"/* end of stage 2 tables for {$e['name']} }}} */\n\n";
+
+ echo
+"/* {{{ Stage 1 table for {$e['name']} */\n";
+
+ echo
+"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
+\t&enc_to_uni_s2_{$s2tables_idents[0]}_00,
+\t&enc_to_uni_s2_{$s2tables_idents[1]}_40,
+\t&enc_to_uni_s2_{$s2tables_idents[2]}_80,
+\t&enc_to_uni_s2_{$s2tables_idents[3]}_C0 }
+};
+";
+
+ echo
+"/* end of stage 1 table for {$e['name']} }}} */\n\n";
+}
+
+$maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
+$a = range(0, $maxencnum);
+foreach ($encodings as $e) { $a[$e['enumid']] = $e['ident']; }
+
+ echo
+"/* {{{ Index of tables for encoding conversion */
+static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";
+
+foreach ($a as $k => $v) {
+ if (is_numeric($v))
+ echo "\tNULL,\n";
+ else
+ echo "\t&enc_to_uni_$v,\n";
+}
+
+ echo
+"};
+/* }}} */\n";
+
+$t = <<<CODE
+
+/* Definitions for mappings *from* Unicode */
+
+typedef struct {
+ unsigned short un_code_point; /* we don't need bigger */
+ unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
+} uni_to_enc;
+
+
+CODE;
+
+echo $t;
+
+$encodings = array(
+ array(
+ "ident" => "iso885915",
+ "name" => "ISO-8859-15",
+ "file" => "mappings/8859-15.TXT",
+ "range" => array(0xA4, 0xBE),
+ ),
+ array(
+ "ident" => "win1252",
+ "name" => "Windows-1252",
+ "file" => "mappings/CP1252.TXT",
+ "range" => array(0x80, 0x9F),
+ ),
+ array(
+ "ident" => "win1251",
+ "name" => "Windows-1251",
+ "file" => "mappings/CP1251.TXT",
+ "range" => array(0x80, 0xFF),
+ ),
+ array(
+ "ident" => "koi8r",
+ "name" => "KOI8-R",
+ "file" => "mappings/KOI8-R.TXT",
+ "range" => array(0x80, 0xFF),
+ ),
+ array(
+ "ident" => "cp866",
+ "name" => "CP-866",
+ "file" => "mappings/CP866.TXT",
+ "range" => array(0x80, 0xFF),
+ ),
+ array(
+ "ident" => "macroman",
+ "name" => "MacRoman",
+ "file" => "mappings/ROMAN.TXT",
+ "range" => array(0x80, 0xFF),
+ ),
+);
+
+foreach ($encodings as $e) {
+ echo
+"/* {{{ Mappings *from* Unicode for {$e['name']} */\n";
+
+ /* process file */
+ $map = array();
+ $lines = explode("\n", file_get_contents($e{'file'}));
+ foreach ($lines as $l) {
+ if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
+ $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
+ }
+
+ $mappy = array();
+ foreach ($map as $v) {
+ if (hexdec($v[0]) >= $e['range'][0] && hexdec($v[0]) <= $e['range'][1])
+ $mappy[hexdec($v[1])] = array(hexdec($v[0]), strtolower($v[2]));
+ }
+ ksort($mappy);
+
+ echo
+"static const uni_to_enc unimap_{$e['ident']}[] = {\n";
+
+ foreach ($mappy as $k => $v) {
+ echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
+ $v[1], " */\n";
+ }
+ echo "};\n";
+
+ echo
+"/* {{{ end of mappings *from* Unicode for {$e['name']} */\n\n";
+}
+
+$data = file_get_contents("ents_html5.txt");
+$pass2 = false;
+$name = "HTML5";
+$ident = "html5";
+again:
+
+$t = <<<'CODE'
+/* HTML 5 has many more named entities.
+ * Some of them map to two unicode code points, not one.
+ * We're going to use a three-stage table (with an extra one for the entities
+ * with two code points). */
+
+#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
+#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
+#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
+#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))
+
+/* Table should be organized with a leading row telling the size of
+ * the table and the default entity (maybe NULL) and the rest being
+ * normal rows ordered by code point so that we can do a binary search */
+typedef union {
+ struct {
+ unsigned size; /* number of remaining entries in the table */
+ const char *default_entity;
+ unsigned short default_entity_len;
+ } leading_entry;
+ struct {
+ unsigned second_cp; /* second code point */
+ const char *entity;
+ unsigned short entity_len;
+ } normal_entry;
+} entity_multicodepoint_row;
+
+/* blocks of these should start at code points k where k % 0xFC0 == 0 */
+typedef struct {
+ char ambiguous; /* if 0 look into entity */
+ union {
+ struct {
+ const char *entity; /* may be NULL */
+ unsigned short entity_len;
+ } ent;
+ const entity_multicodepoint_row *multicodepoint_table;
+ } data;
+} entity_stage3_row;
+
+/* Calculate k & 0x3F Use as offset */
+typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */
+
+/* Calculate k & 0xFC0 >> 6. Use as offset */
+typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */
+
+/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
+ * If larger than 1D, we have no mapping. Otherwise lookup that index */
+
+typedef struct {
+ const entity_stage1_row *ms_table;
+ /* for tables with only basic entities, this member is to be accessed
+ * directly for better performance: */
+ const entity_stage3_row *table;
+} entity_table_opt;
+
+/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */
+
+
+CODE;
+
+if (!$pass2)
+ echo $t;
+
+$dp = array();
+
+foreach (explode("\n", $data) as $l) {
+ if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
+ //echo sprintf("\t{\"%-21s 1, 0x%05d},\n", $matches[1].",", $matches[2]);
+ $dp[] = array($matches[1], $matches[2], $matches[3]);
+ } else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
+ $dp[] = array($matches[1], $matches[2]);
+ }
+}
+
+$origdp = $dp;
+
+usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });
+
+$multicp_rows = array();
+foreach ($dp as $el) {
+ if (count($el) == 3) {
+ $multicp_rows[$el[1]] = array();
+ }
+}
+
+foreach ($dp as $el) {
+ if (key_exists($el[1], $multicp_rows)) {
+ if (count($el) == 3)
+ $multicp_rows[$el[1]][$el[2]] = $el[0];
+ else
+ $multicp_rows[$el[1]]["default"] = $el[0];
+ }
+}
+
+if ($pass2 < 2)
+ echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
+else
+ echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";
+
+if (empty($multicp_rows))
+ goto skip_multicp;
+
+ksort($multicp_rows);
+foreach ($multicp_rows as &$v) { ksort($v); }
+
+echo
+"/* {{{ Start of double code point tables for $name */", "\n\n";
+
+foreach ($multicp_rows as $k => $v) {
+ echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
+ sprintf("%05s", $k), "[] = {", "\n";
+ if (key_exists("default", $v)) {
+ if ($v['default'] == 'GT') /* hack to make > translate to &gt; not GT; */
+ $v['default'] = "gt";
+ echo "\t{ {", sprintf("%02d", count($v) - 1),
+ ",\t\t", sprintf("\"%-21s", $v["default"].'",'), "\t",
+ sprintf("% 2d", strlen($v["default"])), '} },', "\n";
+ } else {
+ echo "\t{ {", sprintf("%02d", count($v)),
+ ",\t\t", sprintf("%-22s", 'NULL'), ",\t0} },\n";
+ }
+ unset($v["default"]);
+ foreach ($v as $l => $w) {
+ echo "\t{ {", sprintf("0x%05s", $l), ",\t", sprintf("\"%-21s", $w.'",'), "\t",
+ sprintf("% 2d", strlen($w)), '} },', "\n";
+ }
+ echo "};\n";
+}
+echo "\n/* End of double code point tables }}} */", "\n\n";
+
+skip_multicp:
+
+if ($pass2 < 2)
+ echo "/* {{{ Stage 3 Tables for $name */", "\n\n";
+
+$t = <<<CODE
+static const entity_stage3_row empty_stage3_table[] = {
+ /* 64 elements */
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+ {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
+};
+
+CODE;
+
+if (!$pass2)
+ echo $t;
+
+$mstable = array();
+foreach ($dp as $el) {
+ $s1 = (hexdec($el[1]) & 0xFFF000) >> 12;
+ $s2 = (hexdec($el[1]) & 0xFC0) >> 6;
+ $s3 = hexdec($el[1]) & 0x3F;
+ if (key_exists($el[1], $multicp_rows)) {
+ $mstable[$s1][$s2][$s3] = "";
+ } else {
+ $mstable[$s1][$s2][$s3] = $el[0];
+ }
+}
+
+for ($i = 0; $i < 0x1E; $i++) {
+ for ($k = 0; $k < 64; $k++) {
+ $any3 = false;
+ $col3 = array();
+ for ($l = 0; $l < 64; $l++) {
+ if (isset($mstable[$i][$k][$l])) {
+ $any3 = true;
+ $col3[$l] = $mstable[$i][$k][$l];
+ } else {
+ $col3[$l] = null;
+ }
+ }
+ if ($any3) {
+ echo "static const entity_stage3_row stage3_table_{$ident}_",
+ sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
+ foreach ($col3 as $y => $z) {
+ if ($y == 0) echo "\t";
+ elseif ($y % 4 == 0) echo "\n\t";
+ else echo " ";
+ if ($z === NULL)
+ echo "{0, { {NULL, 0} } },";
+ elseif ($z === "QUOT") /* hack to translate " into &quote;, not &QUOT; */
+ echo "{0, { {\"quot\", 4} } },";
+ elseif ($z !== "")
+ echo "{0, { {\"$z\", ", strlen($z), "} } },";
+ else
+ echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
+ ($i << 12) | ($k << 6) | $y ), "} } },";
+
+ }
+ echo "\n};\n\n";
+ }
+ }
+}
+
+if ($pass2 < 2)
+ echo "/* end of stage 3 Tables for $name }}} */", "\n\n";
+
+if ($pass2 > 1)
+ goto hashtables;
+
+echo
+"/* {{{ Stage 2 Tables for $name */", "\n\n";
+
+$t = <<<CODE
+static const entity_stage2_row empty_stage2_table[] = {
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+ empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
+};
+
+CODE;
+
+if (!$pass2)
+ echo $t;
+
+for ($i = 0; $i < 0x1E; $i++) {
+ $any = false;
+ for ($k = 0; $k < 64; $k++) {
+ if (isset($mstable[$i][$k]))
+ $any = true;
+ }
+ if ($any) {
+ echo "static const entity_stage2_row stage2_table_{$ident}_",
+ sprintf("%02X000", $i), "[] = {\n";
+ for ($k = 0; $k < 64; $k++) {
+ if ($k == 0) echo "\t";
+ elseif ($k % 4 == 0) echo "\n\t";
+ else echo " ";
+ if (isset($mstable[$i][$k])) {
+ echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
+ } else {
+ echo "empty_stage3_table", ",";
+ }
+ }
+ echo "\n};\n\n";
+ }
+}
+
+echo
+"/* end of stage 2 tables for $name }}} */", "\n\n";
+
+echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
+for ($i = 0; $i < 0x1E; $i++) {
+ if (isset($mstable[$i]))
+ echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
+ else
+ echo "\tempty_stage2_table,\n";
+}
+echo "};\n\n";
+
+echo
+"/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
+
+/* commented-out; this enabled binary search, which turned out to be
+ * significantly slower than the hash tables for html 5 entities */
+//echo
+//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
+
+//$t = <<<CODE
+//typedef struct {
+// const char *entity;
+// unsigned short entity_len;
+// unsigned int codepoint1;
+// unsigned int codepoint2;
+//} entity_cp_map;
+//
+//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
+// ( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
+//
+//static const entity_cp_map html5_ent_cp_map[] = {
+//
+//CODE;
+//echo $t;
+//
+//$dp = $origdp;
+//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
+// return $d==0?strcmp($a[0], $b[0]):$d; });
+//
+//$k = 0;
+//foreach ($dp as $o) {
+// if ($k == 0) echo "\t";
+// elseif ($k % 3 == 0) echo "\n\t";
+// else echo " ";
+// if (isset($o[2]))
+// echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
+// hexdec($o[1]), hexdec($o[2]));
+// else
+// echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
+// hexdec($o[1]));
+//
+// if (isset($o[2])) {
+// $entlen = strlen($o[0]) + 2;
+// $utf8len = strlen(
+// mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
+// if ($utf8len > $entlen*1.2) {
+// die("violated assumption for traverse_for_entities");
+// }
+// }
+//
+// $k++;
+//}
+//echo "\n};\n\n";
+//
+//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
+//
+//echo
+//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";
+
+hashtables:
+
+echo
+"/* {{{ $name hash table for entity -> codepoint */", "\n\n";
+
+$t = <<<CODE
+typedef struct {
+ const char *entity;
+ unsigned short entity_len;
+ unsigned int codepoint1;
+ unsigned int codepoint2;
+} entity_cp_map;
+
+typedef const entity_cp_map *entity_ht_bucket;
+
+typedef struct {
+ unsigned num_elems; /* power of 2 */
+ const entity_ht_bucket *buckets; /* .num_elems elements */
+} entity_ht;
+
+static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };
+
+CODE;
+
+if (!$pass2)
+ echo $t;
+
+function hashfun($str)
+{
+
+ $hash = 5381;
+ $nKeyLength = strlen($str);
+ $pos = 0;
+
+ for (; $nKeyLength > 0; $nKeyLength--) {
+ $hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos++]))
+ & 0xFFFFFFFF;
+ }
+ return $hash;
+
+}
+
+$numelems = max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
+$mask = $numelems - 1;
+$hashes = array();
+foreach ($origdp as $e) {
+ $hashes[hashfun($e[0]) & $mask][] = $e;
+ if (isset($e[2])) {
+ $entlen = strlen($e[0]) + 2;
+ $utf8len = strlen(
+ mb_convert_encoding("&#x{$e[1]};&#x{$e[2]};", "UTF-8", "HTML-ENTITIES"));
+ if ($utf8len > $entlen*1.2) {
+ die("violated assumption for traverse_for_entities");
+ }
+ }
+}
+
+for ($i = 0; $i < $numelems; $i++) {
+ if (empty($hashes[$i]))
+ continue;
+ echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
+ foreach ($hashes[$i] as $h) {
+ if (isset($h[2])) {
+ echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
+ $h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
+ } else {
+ echo sprintf(' {"%s", %d, 0x%05X, 0},',
+ $h[0], strlen($h[0]), hexdec($h[1]));
+ }
+ }
+ echo " {NULL, 0, 0, 0} };\n";
+}
+echo "\n";
+
+echo
+"static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";
+
+for ($i = 0; $i < $numelems; $i++) {
+ if ($i == 0) echo "\t";
+ elseif ($i % 4 == 0) echo "\n\t";
+ else echo " ";
+ if (empty($hashes[$i]))
+ echo "ht_bucket_empty,";
+ else
+ echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
+}
+echo "\n};\n\n";
+
+echo
+"static const entity_ht ent_ht_{$ident} = {
+ ", sprintf("0x%X", $numelems), ",
+ ht_buckets_{$ident}
+};\n\n";
+
+echo
+"/* end of $name hash table for entity -> codepoint }}} */\n\n";
+
+if (!$pass2) {
+ $data = file_get_contents("ents_html401.txt");
+ $pass2 = 1;
+ $name = "HTML 4.01";
+ $ident = "html4";
+ goto again;
+} elseif ($pass2 == 1) {
+ $data = file_get_contents("ents_basic.txt");
+ $pass2 = 2;
+ $name = "Basic entities (no apos)";
+ $ident = "be_noapos";
+ goto again;
+} elseif ($pass2 == 2) {
+ $data = file_get_contents("ents_basic_apos.txt");
+ $pass2 = 3;
+ $name = "Basic entities (with apos)";
+ $ident = "be_apos";
+ goto again;
+}
+
+echo "#endif /* HTML_TABLES_H */\n";