diff options
Diffstat (limited to 'ext/standard/html_tables/html_table_gen.php')
-rw-r--r-- | ext/standard/html_tables/html_table_gen.php | 812 |
1 files changed, 812 insertions, 0 deletions
<?php
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-2010 The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Authors: Gustavo Lopes <cataphract@php.net>                          |
   +----------------------------------------------------------------------+
*/

/* This file prints to stdout the contents of ext/standard/html_tables.h */
/* put together with glue; have patience */

/*
 * Overview of what is printed, in order:
 *   1. a fixed C prologue (charset enum, charset_map, enc_to_uni typedefs);
 *   2. two-stage "to Unicode" tables for each single-byte encoding, read
 *      from the files under mappings/;
 *   3. "from Unicode" tables for the same mapping files, restricted to a
 *      byte range per encoding;
 *   4. for each of ents_html5.txt, ents_html401.txt, ents_basic.txt and
 *      ents_basic_apos.txt: codepoint -> entity tables and an
 *      entity -> codepoint hash table.
 *
 * All input paths are relative to the current working directory.
 */

/**
 * Reads a whole input file, aborting the script if it cannot be read.
 *
 * Generating a header from a missing data file would silently produce empty
 * tables, so a read failure is treated as fatal.
 *
 * @param string $path path of the file to read
 * @return string the file contents
 */
function read_input_file($path)
{
	$contents = @file_get_contents($path);
	if ($contents === false) {
		/* stdout carries the generated header; report on stderr instead */
		file_put_contents('php://stderr', "html_table_gen: cannot read '$path'\n");
		exit(1);
	}
	return $contents;
}

/**
 * Number of bytes needed to encode a code point in UTF-8.
 *
 * @param int $cp Unicode code point
 * @return int 1 to 4
 */
function utf8_length($cp)
{
	if ($cp < 0x80)
		return 1;
	if ($cp < 0x800)
		return 2;
	if ($cp < 0x10000)
		return 3;
	return 4;
}

/**
 * Comparator ordering hexadecimal array keys by their numeric value.
 *
 * The plain ksort() comparison would treat keys such as "001E3" as numbers
 * in scientific notation. The special key "default" sorts before any code
 * point.
 *
 * @param int|string $a
 * @param int|string $b
 * @return int negative, zero or positive
 */
function compare_hex_keys($a, $b)
{
	if ($a === "default")
		return ($b === "default") ? 0 : -1;
	if ($b === "default")
		return 1;

	$x = hexdec((string)$a);
	$y = hexdec((string)$b);
	if ($x == $y)
		return 0;
	return ($x < $y) ? -1 : 1;
}

/**
 * Hash of an entity name: starts at 5381 and, for every byte, computes
 * hash * 33 + byte truncated to 32 bits. Used below to distribute the
 * entities over the buckets of the generated hash tables.
 *
 * @param string $str entity name
 * @return int hash value
 */
function hashfun($str)
{
	$hash = 5381;
	$len = strlen($str);

	for ($pos = 0; $pos < $len; $pos++) {
		$hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos]))
			& 0xFFFFFFFF;
	}
	return $hash;
}

/* {{{ C prologue; the %s placeholder receives the current year */
$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-%s The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

/* \$Id$ */

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs) ((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs) ((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs) ((cs) >= cs_big5)

static const struct {
	const char *codeset;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1", cs_8859_1 },
	{ "ISO8859-1", cs_8859_1 },
	{ "ISO-8859-15", cs_8859_15 },
	{ "ISO8859-15", cs_8859_15 },
	{ "utf-8", cs_utf_8 },
	{ "cp1252", cs_cp1252 },
	{ "Windows-1252", cs_cp1252 },
	{ "1252", cs_cp1252 },
	{ "BIG5", cs_big5 },
	{ "950", cs_big5 },
	{ "GB2312", cs_gb2312 },
	{ "936", cs_gb2312 },
	{ "BIG5-HKSCS", cs_big5hkscs },
	{ "Shift_JIS", cs_sjis },
	{ "SJIS", cs_sjis },
	{ "932", cs_sjis },
	{ "EUCJP", cs_eucjp },
	{ "EUC-JP", cs_eucjp },
	{ "KOI8-R", cs_koi8r },
	{ "koi8-ru", cs_koi8r },
	{ "koi8r", cs_koi8r },
	{ "cp1251", cs_cp1251 },
	{ "Windows-1251", cs_cp1251 },
	{ "win-1251", cs_cp1251 },
	{ "iso8859-5", cs_8859_5 },
	{ "iso-8859-5", cs_8859_5 },
	{ "cp866", cs_cp866 },
	{ "866", cs_cp866 },
	{ "ibm866", cs_cp866 },
	{ "MacRoman", cs_macroman },
	{ NULL }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
	unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
	const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

printf($t, date("Y"));
/* }}} */

/* {{{ Mappings *to* Unicode */

/* "enumid" is the position of the charset in enum entity_charset above */
$encodings = array(
	array(
		"ident" => "iso88591",
		"enumid" => 1,
		"name" => "ISO-8859-1",
		"file" => "mappings/8859-1.TXT",
	),
	array(
		"ident" => "iso88595",
		"enumid" => 5,
		"name" => "ISO-8859-5",
		"file" => "mappings/8859-5.TXT",
	),
	array(
		"ident" => "iso885915",
		"enumid" => 3,
		"name" => "ISO-8859-15",
		"file" => "mappings/8859-15.TXT",
	),
	array(
		"ident" => "win1252",
		"enumid" => 2,
		"enumident" => "cp1252",
		"name" => "Windows-1252",
		"file" => "mappings/CP1252.TXT",
	),
	array(
		"ident" => "win1251",
		"enumid" => 4,
		"enumident" => "cp1251",
		"name" => "Windows-1251",
		"file" => "mappings/CP1251.TXT",
	),
	array(
		"ident" => "koi8r",
		"enumid" => 8,
		"name" => "KOI8-R",
		"file" => "mappings/KOI8-R.TXT",
	),
	array(
		"ident" => "cp866",
		"enumid" => 6,
		"name" => "CP-866",
		"file" => "mappings/CP866.TXT",
	),
	array(
		"ident" => "macroman",
		"enumid" => 7,
		"name" => "MacRoman",
		"file" => "mappings/ROMAN.TXT",
	),
);

/* Stage 2 tables emitted so far and, at the same index, the C symbol each
 * one was emitted under. An encoding whose 64-entry block is identical to an
 * earlier one reuses that symbol instead of emitting a duplicate. */
$prevStage2 = array();
$prevStage2Symbols = array();

foreach ($encodings as $e) {
	echo
"/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

	/* process file: lines of the form 0xBYTE<TAB>0xCODEPOINT */
	$mappy = array();
	foreach (explode("\n", read_input_file($e['file'])) as $l) {
		if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
			$mappy[hexdec($matches[1])] = hexdec($matches[2]);
	}

	/* calculate two-stage tables: 4 blocks of 64 code points */
	$blocks = array();
	for ($i = 0; $i < 4; $i++) {
		for ($j = 0; $j < 64; $j++) {
			$cp = $i << 6 | $j;
			$blocks[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
		}
	}

	echo
"/* {{{ Stage 2 tables for {$e['name']} */\n\n";

	$s2symbols = array();
	for ($i = 0; $i < 4; $i++) {
		/* strict search: an unmapped slot (NULL) must not match U+0000 */
		$found = array_keys($prevStage2, $blocks[$i], true);
		if ($found !== array()) {
			$s2symbols[$i] = $prevStage2Symbols[$found[0]];
			continue;
		}

		$s2symbols[$i] = "enc_to_uni_s2_{$e['ident']}_" . sprintf("%02X", $i << 6);

		echo "static const enc_to_uni_stage2 {$s2symbols[$i]} = { {\n";
		for ($j = 0; $j < 64; $j++) {
			if ($j == 0) echo "\t";
			elseif ($j % 6 == 0) echo "\n\t";
			else echo " ";
			if ($blocks[$i][$j] !== NULL)
				echo sprintf("0x%04X,", $blocks[$i][$j]);
			else
				echo "0xFFFF,"; /* special value; indicates no mapping */
		}
		echo "\n} };\n\n";

		$prevStage2[] = $blocks[$i];
		$prevStage2Symbols[] = $s2symbols[$i];
	}

	echo
"/* end of stage 2 tables for {$e['name']} }}} */\n\n";

	echo
"/* {{{ Stage 1 table for {$e['name']} */\n";

	echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&{$s2symbols[0]},
\t&{$s2symbols[1]},
\t&{$s2symbols[2]},
\t&{$s2symbols[3]} }
};
";

	echo
"/* end of stage 1 table for {$e['name']} }}} */\n\n";
}

/* index of stage 1 tables by enum value; charsets without a table get NULL */
$maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
$a = array_fill(0, $maxencnum + 1, NULL);
foreach ($encodings as $e) { $a[$e['enumid']] = $e['ident']; }

	echo
"/* {{{ Index of tables for encoding conversion */
static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($a as $v) {
	if ($v === NULL)
		echo "\tNULL,\n";
	else
		echo "\t&enc_to_uni_$v,\n";
}

	echo
"};
/* }}} */\n";
/* }}} */

/* {{{ Mappings *from* Unicode */
$t = <<<CODE

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

echo $t;

/* "range" is the inclusive range of bytes of the encoding to include */
$encodings = array(
	array(
		"ident" => "iso885915",
		"name" => "ISO-8859-15",
		"file" => "mappings/8859-15.TXT",
		"range" => array(0xA4, 0xBE),
	),
	array(
		"ident" => "win1252",
		"name" => "Windows-1252",
		"file" => "mappings/CP1252.TXT",
		"range" => array(0x80, 0x9F),
	),
	array(
		"ident" => "win1251",
		"name" => "Windows-1251",
		"file" => "mappings/CP1251.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "koi8r",
		"name" => "KOI8-R",
		"file" => "mappings/KOI8-R.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "cp866",
		"name" => "CP-866",
		"file" => "mappings/CP866.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "macroman",
		"name" => "MacRoman",
		"file" => "mappings/ROMAN.TXT",
		"range" => array(0x80, 0xFF),
	),
);

foreach ($encodings as $e) {
	echo
"/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

	/* process file: 0xBYTE<TAB>0xCODEPOINT ... # description */
	$map = array();
	$lines = explode("\n", read_input_file($e['file']));
	foreach ($lines as $l) {
		if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
			$map[] = array($matches[1], $matches[2], rtrim($matches[3]));
	}

	/* code point => (byte, description), for bytes inside the range only */
	$mappy = array();
	foreach ($map as $v) {
		$byte = hexdec($v[0]);
		if ($byte >= $e['range'][0] && $byte <= $e['range'][1])
			$mappy[hexdec($v[1])] = array($byte, strtolower($v[2]));
	}
	ksort($mappy);

	echo
"static const uni_to_enc unimap_{$e['ident']}[] = {\n";

	foreach ($mappy as $k => $v) {
		echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
			$v[1], " */\n";
	}
	echo "};\n";

	echo
"/* end of mappings *from* Unicode for {$e['name']} }}} */\n\n";
}
/* }}} */

/* {{{ Entity tables.
 * The code from the "again" label down to the final if/elseif chain runs
 * four times: $pass2 is false for HTML 5, then 1 (HTML 4.01), 2 (basic
 * entities without apos) and 3 (basic entities with apos). Shared typedefs
 * and the empty tables are only printed on the first pass; from pass 2 on
 * the stage 2 and stage 1 tables are skipped. */
$data = read_input_file("ents_html5.txt");
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:

$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* Table should be organized with a leading row telling the size of
 * the table and the default entity (maybe NULL) and the rest being
 * normal rows ordered by code point so that we can do a binary search */
typedef union {
	struct {
		unsigned size; /* number of remaining entries in the table */
		const char *default_entity;
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		unsigned second_cp; /* second code point */
		const char *entity;
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

if (!$pass2)
	echo $t;

/* Each entry is array(entity name, first code point[, second code point]),
 * with the code points kept as the hexadecimal strings read from the file */
$dp = array();

foreach (explode("\n", $data) as $l) {
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
		//echo sprintf("\t{\"%-21s 1, 0x%05d},\n", $matches[1].",", $matches[2]);
		$dp[] = array($matches[1], $matches[2], $matches[3]);
	} else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
		$dp[] = array($matches[1], $matches[2]);
	}
}

$origdp = $dp;

usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });

/* first code point => (second code point | "default" => entity name), only
 * for first code points that start at least one two-code-point entity */
$multicp_rows = array();
foreach ($dp as $el) {
	if (count($el) == 3) {
		$multicp_rows[$el[1]] = array();
	}
}

foreach ($dp as $el) {
	if (array_key_exists($el[1], $multicp_rows)) {
		if (count($el) == 3)
			$multicp_rows[$el[1]][$el[2]] = $el[0];
		else
			$multicp_rows[$el[1]]["default"] = $el[0];
	}
}

if ($pass2 < 2)
	echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
	echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

if (empty($multicp_rows))
	goto skip_multicp;

/* sort by numeric code point; the generated rows are binary searched */
uksort($multicp_rows, 'compare_hex_keys');
foreach ($multicp_rows as &$v) { uksort($v, 'compare_hex_keys'); }
/* break the reference, otherwise the by-value loop below would overwrite
 * the last row of $multicp_rows on every iteration */
unset($v);

echo
"/* {{{ Start of double code point tables for $name */", "\n\n";

foreach ($multicp_rows as $k => $v) {
	/* same %05X form that the stage 3 tables use to refer to this table */
	echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
		sprintf("%05X", hexdec((string)$k)), "[] = {", "\n";
	if (array_key_exists("default", $v)) {
		if ($v['default'] == 'GT') /* hack to make > translate to &gt; not &GT; */
			$v['default'] = "gt";
		echo "\t{ {", sprintf("%02d", count($v) - 1),
			",\t\t", sprintf("\"%-21s", $v["default"].'",'), "\t",
			sprintf("% 2d", strlen($v["default"])), '} },', "\n";
	} else {
		echo "\t{ {", sprintf("%02d", count($v)),
			",\t\t", sprintf("%-22s", 'NULL'), ",\t0} },\n";
	}
	unset($v["default"]);
	foreach ($v as $l => $w) {
		echo "\t{ {", sprintf("0x%05X", hexdec((string)$l)), ",\t", sprintf("\"%-21s", $w.'",'), "\t",
			sprintf("% 2d", strlen($w)), '} },', "\n";
	}
	echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:

if ($pass2 < 2)
	echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage3_row empty_stage3_table[] = {
	/* 64 elements */
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
};

CODE;

if (!$pass2)
	echo $t;

/* $mstable[stage 1 index][stage 2 index][stage 3 index] is the entity name,
 * or "" when the code point has a multi code point table. When several
 * entries share a code point, the last one in $dp wins. */
$mstable = array();
foreach ($dp as $el) {
	$cp = hexdec($el[1]);
	$s1 = ($cp & 0xFFF000) >> 12;
	$s2 = ($cp & 0xFC0) >> 6;
	$s3 = $cp & 0x3F;
	if (array_key_exists($el[1], $multicp_rows)) {
		$mstable[$s1][$s2][$s3] = "";
	} else {
		$mstable[$s1][$s2][$s3] = $el[0];
	}
}

for ($i = 0; $i < 0x1E; $i++) {
	for ($k = 0; $k < 64; $k++) {
		$any3 = false;
		$col3 = array();
		for ($l = 0; $l < 64; $l++) {
			if (isset($mstable[$i][$k][$l])) {
				$any3 = true;
				$col3[$l] = $mstable[$i][$k][$l];
			} else {
				$col3[$l] = null;
			}
		}
		if ($any3) {
			echo "static const entity_stage3_row stage3_table_{$ident}_",
				sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
			foreach ($col3 as $y => $z) {
				if ($y == 0) echo "\t";
				elseif ($y % 4 == 0) echo "\n\t";
				else echo " ";
				if ($z === NULL)
					echo "{0, { {NULL, 0} } },";
				elseif ($z === "QUOT") /* hack to translate " into &quot;, not &QUOT; */
					echo "{0, { {\"quot\", 4} } },";
				elseif ($z !== "")
					echo "{0, { {\"$z\", ", strlen($z), "} } },";
				else
					echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
						($i << 12) | ($k << 6) | $y ), "} } },";

			}
			echo "\n};\n\n";
		}
	}
}

if ($pass2 < 2)
	echo "/* end of stage 3 Tables for $name }}} */", "\n\n";

if ($pass2 > 1)
	goto hashtables;

echo
"/* {{{ Stage 2 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage2_row empty_stage2_table[] = {
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
};

CODE;

if (!$pass2)
	echo $t;

for ($i = 0; $i < 0x1E; $i++) {
	$any = false;
	for ($k = 0; $k < 64; $k++) {
		if (isset($mstable[$i][$k]))
			$any = true;
	}
	if ($any) {
		echo "static const entity_stage2_row stage2_table_{$ident}_",
			sprintf("%02X000", $i), "[] = {\n";
		for ($k = 0; $k < 64; $k++) {
			if ($k == 0) echo "\t";
			elseif ($k % 4 == 0) echo "\n\t";
			else echo " ";
			if (isset($mstable[$i][$k])) {
				echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
			} else {
				echo "empty_stage3_table", ",";
			}
		}
		echo "\n};\n\n";
	}
}

echo
"/* end of stage 2 tables for $name }}} */", "\n\n";

echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
	if (isset($mstable[$i]))
		echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
	else
		echo "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo
"/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";

/* commented-out; this enabled binary search, which turned out to be
 * significantly slower than the hash tables for html 5 entities */
//echo
//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";

//$t = <<<CODE
//typedef struct {
//	const char *entity;
//	unsigned short entity_len;
//	unsigned int codepoint1;
//	unsigned int codepoint2;
//} entity_cp_map;
//
//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
//
//static const entity_cp_map html5_ent_cp_map[] = {
//
//CODE;
//echo $t;
//
//$dp = $origdp;
//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
//	return $d==0?strcmp($a[0], $b[0]):$d; });
//
//$k = 0;
//foreach ($dp as $o) {
//	if ($k == 0) echo "\t";
//	elseif ($k % 3 == 0) echo "\n\t";
//	else echo " ";
//	if (isset($o[2]))
//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
//			hexdec($o[1]), hexdec($o[2]));
//	else
//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
//			hexdec($o[1]));
//
//	if (isset($o[2])) {
//		$entlen = strlen($o[0]) + 2;
//		$utf8len = strlen(
//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
//		if ($utf8len > $entlen*1.2) {
//			die("violated assumption for traverse_for_entities");
//		}
//	}
//
//	$k++;
//}
//echo "\n};\n\n";
//
//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
//
//echo
//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";

hashtables:

echo
"/* {{{ $name hash table for entity -> codepoint */", "\n\n";

$t = <<<CODE
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

if (!$pass2)
	echo $t;

/* number of buckets: smallest power of 2 that is at least 1.5 times the
 * number of entities, with a minimum of 16 */
$numelems = (int)max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
$mask = $numelems - 1;
$hashes = array();
foreach ($origdp as $e) {
	$hashes[hashfun($e[0]) & $mask][] = $e;
	if (isset($e[2])) {
		/* length of "&name;" against the UTF-8 length of both code points */
		$entlen = strlen($e[0]) + 2;
		$utf8len = utf8_length(hexdec($e[1])) + utf8_length(hexdec($e[2]));
		if ($utf8len > $entlen*1.2) {
			die("violated assumption for traverse_for_entities");
		}
	}
}

for ($i = 0; $i < $numelems; $i++) {
	if (empty($hashes[$i]))
		continue;
	echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
	foreach ($hashes[$i] as $h) {
		if (isset($h[2])) {
			echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
				$h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
		} else {
			echo sprintf(' {"%s", %d, 0x%05X, 0},',
				$h[0], strlen($h[0]), hexdec($h[1]));
		}
	}
	echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

echo
"static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";

for ($i = 0; $i < $numelems; $i++) {
	if ($i == 0) echo "\t";
	elseif ($i % 4 == 0) echo "\n\t";
	else echo " ";
	if (empty($hashes[$i]))
		echo "ht_bucket_empty,";
	else
		echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo
"static const entity_ht ent_ht_{$ident} = {
	", sprintf("0x%X", $numelems), ",
	ht_buckets_{$ident}
};\n\n";

echo
"/* end of $name hash table for entity -> codepoint }}} */\n\n";

/* select the input for the next pass, if any */
if (!$pass2) {
	$data = read_input_file("ents_html401.txt");
	$pass2 = 1;
	$name = "HTML 4.01";
	$ident = "html4";
	goto again;
} elseif ($pass2 == 1) {
	$data = read_input_file("ents_basic.txt");
	$pass2 = 2;
	$name = "Basic entities (no apos)";
	$ident = "be_noapos";
	goto again;
} elseif ($pass2 == 2) {
	$data = read_input_file("ents_basic_apos.txt");
	$pass2 = 3;
	$name = "Basic entities (with apos)";
	$ident = "be_apos";
	goto again;
}
/* }}} */

echo "#endif /* HTML_TABLES_H */\n";