diff options
author | Richard Maw <richard.maw@codethink.co.uk> | 2012-01-17 14:43:55 +0000 |
---|---|---|
committer | Richard Maw <richard.maw@codethink.co.uk> | 2012-01-17 14:43:55 +0000 |
commit | 2de9abc5c9d40b3c716307d67d16146f823fd554 (patch) | |
tree | 6979db67934ddc8b564150b465846a383b428ff8 /gnulib/lib/gen-uni-tables.c | |
parent | 33cc1c6fda6e72a7bae1401e9b2cec495a4d3ff1 (diff) | |
download | patch-baserock/bootstrap.tar.gz |
add the output of bootstrapbaserock/bootstrap-pass2baserock/bootstrap
Diffstat (limited to 'gnulib/lib/gen-uni-tables.c')
m--------- | gnulib | 0 | ||||
-rw-r--r-- | gnulib/lib/gen-uni-tables.c | 9780 |
2 files changed, 9780 insertions, 0 deletions
diff --git a/gnulib b/gnulib deleted file mode 160000 -Subproject 443bc5ffcf7429e557f4a371b0661abe98ddbc1 diff --git a/gnulib/lib/gen-uni-tables.c b/gnulib/lib/gen-uni-tables.c new file mode 100644 index 0000000..0eddbb1 --- /dev/null +++ b/gnulib/lib/gen-uni-tables.c @@ -0,0 +1,9780 @@ +/* Generate Unicode conforming character classification tables and + line break properties tables and word break property tables and + decomposition/composition and case mapping tables from a UnicodeData file. + Copyright (C) 2000-2002, 2004, 2007-2011 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2000-2002. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Usage example: + $ gen-uni-tables /usr/local/share/Unidata/UnicodeData.txt \ + /usr/local/share/Unidata/PropList.txt \ + /usr/local/share/Unidata/DerivedCoreProperties.txt \ + /usr/local/share/Unidata/ArabicShaping.txt \ + /usr/local/share/Unidata/Scripts.txt \ + /usr/local/share/Unidata/Blocks.txt \ + /usr/local/share/Unidata/PropList-3.0.1.txt \ + /usr/local/share/Unidata/EastAsianWidth.txt \ + /usr/local/share/Unidata/LineBreak.txt \ + /usr/local/share/Unidata/WordBreakProperty.txt \ + /usr/local/share/Unidata/GraphemeBreakProperty.txt \ + /usr/local/share/Unidata/CompositionExclusions.txt \ + /usr/local/share/Unidata/SpecialCasing.txt \ + /usr/local/share/Unidata/CaseFolding.txt \ + 6.0.0 + */ + +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +/* ========================================================================= */ + +/* Reading UnicodeData.txt. */ +/* See UCD.html. */ + +/* This structure represents one line in the UnicodeData.txt file. */ +struct unicode_attribute +{ + const char *name; /* Character name */ + const char *category; /* General category */ + const char *combining; /* Canonical combining class */ + const char *bidi; /* Bidirectional category */ + const char *decomposition; /* Character decomposition mapping */ + const char *decdigit; /* Decimal digit value */ + const char *digit; /* Digit value */ + const char *numeric; /* Numeric value */ + bool mirrored; /* mirrored */ + const char *oldname; /* Old Unicode 1.0 name */ + const char *comment; /* Comment */ + unsigned int upper; /* Uppercase mapping */ + unsigned int lower; /* Lowercase mapping */ + unsigned int title; /* Titlecase mapping */ +}; + +/* Missing fields are represented with "" for strings, and NONE for + characters. */ +#define NONE (~(unsigned int)0) + +/* The entire contents of the UnicodeData.txt file. */ +struct unicode_attribute unicode_attributes [0x110000]; + +/* Stores in unicode_attributes[i] the values from the given fields. */ +static void +fill_attribute (unsigned int i, + const char *field1, const char *field2, + const char *field3, const char *field4, + const char *field5, const char *field6, + const char *field7, const char *field8, + const char *field9, const char *field10, + const char *field11, const char *field12, + const char *field13, const char *field14) +{ + struct unicode_attribute * uni; + + if (i >= 0x110000) + { + fprintf (stderr, "index too large\n"); + exit (1); + } + if (strcmp (field2, "Cs") == 0) + /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */ + return; + uni = &unicode_attributes[i]; + /* Copy the strings. */ + uni->name = strdup (field1); + uni->category = (field2[0] == '\0' ? "" : strdup (field2)); + uni->combining = (field3[0] == '\0' ? "" : strdup (field3)); + uni->bidi = (field4[0] == '\0' ? "" : strdup (field4)); + uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5)); + uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6)); + uni->digit = (field7[0] == '\0' ? "" : strdup (field7)); + uni->numeric = (field8[0] == '\0' ? "" : strdup (field8)); + uni->mirrored = (field9[0] == 'Y'); + uni->oldname = (field10[0] == '\0' ? "" : strdup (field10)); + uni->comment = (field11[0] == '\0' ? "" : strdup (field11)); + uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16)); + uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16)); + uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16)); +} + +/* Maximum length of a field in the UnicodeData.txt file. */ +#define FIELDLEN 120 + +/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN. + Reads up to (but excluding) DELIM. + Returns 1 when a field was successfully read, otherwise 0. */ +static int +getfield (FILE *stream, char *buffer, int delim) +{ + int count = 0; + int c; + + for (; (c = getc (stream)), (c != EOF && c != delim); ) + { + /* The original unicode.org UnicodeData.txt file happens to have + CR/LF line terminators. Silently convert to LF. */ + if (c == '\r') + continue; + + /* Put c into the buffer. */ + if (++count >= FIELDLEN - 1) + { + fprintf (stderr, "field longer than expected, increase FIELDLEN\n"); + exit (1); + } + *buffer++ = c; + } + + if (c == EOF) + return 0; + + *buffer = '\0'; + return 1; +} + +/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt + file. */ +static void +fill_attributes (const char *unicodedata_filename) +{ + unsigned int i, j; + FILE *stream; + char field0[FIELDLEN]; + char field1[FIELDLEN]; + char field2[FIELDLEN]; + char field3[FIELDLEN]; + char field4[FIELDLEN]; + char field5[FIELDLEN]; + char field6[FIELDLEN]; + char field7[FIELDLEN]; + char field8[FIELDLEN]; + char field9[FIELDLEN]; + char field10[FIELDLEN]; + char field11[FIELDLEN]; + char field12[FIELDLEN]; + char field13[FIELDLEN]; + char field14[FIELDLEN]; + int lineno = 0; + + for (i = 0; i < 0x110000; i++) + unicode_attributes[i].name = NULL; + + stream = fopen (unicodedata_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename); + exit (1); + } + + for (;;) + { + int n; + + lineno++; + n = getfield (stream, field0, ';'); + n += getfield (stream, field1, ';'); + n += getfield (stream, field2, ';'); + n += getfield (stream, field3, ';'); + n += getfield (stream, field4, ';'); + n += getfield (stream, field5, ';'); + n += getfield (stream, field6, ';'); + n += getfield (stream, field7, ';'); + n += getfield (stream, field8, ';'); + n += getfield (stream, field9, ';'); + n += getfield (stream, field10, ';'); + n += getfield (stream, field11, ';'); + n += getfield (stream, field12, ';'); + n += getfield (stream, field13, ';'); + n += getfield (stream, field14, '\n'); + if (n == 0) + break; + if (n != 15) + { + fprintf (stderr, "short line in '%s':%d\n", + unicodedata_filename, lineno); + exit (1); + } + i = strtoul (field0, NULL, 16); + if (field1[0] == '<' + && strlen (field1) >= 9 + && strcmp (field1 + strlen (field1) - 8, ", First>") == 0) + { + /* Deal with a range. */ + lineno++; + n = getfield (stream, field0, ';'); + n += getfield (stream, field1, ';'); + n += getfield (stream, field2, ';'); + n += getfield (stream, field3, ';'); + n += getfield (stream, field4, ';'); + n += getfield (stream, field5, ';'); + n += getfield (stream, field6, ';'); + n += getfield (stream, field7, ';'); + n += getfield (stream, field8, ';'); + n += getfield (stream, field9, ';'); + n += getfield (stream, field10, ';'); + n += getfield (stream, field11, ';'); + n += getfield (stream, field12, ';'); + n += getfield (stream, field13, ';'); + n += getfield (stream, field14, '\n'); + if (n != 15) + { + fprintf (stderr, "missing end range in '%s':%d\n", + unicodedata_filename, lineno); + exit (1); + } + if (!(field1[0] == '<' + && strlen (field1) >= 8 + && strcmp (field1 + strlen (field1) - 7, ", Last>") == 0)) + { + fprintf (stderr, "missing end range in '%s':%d\n", + unicodedata_filename, lineno); + exit (1); + } + field1[strlen (field1) - 7] = '\0'; + j = strtoul (field0, NULL, 16); + for (; i <= j; i++) + fill_attribute (i, field1+1, field2, field3, field4, field5, + field6, field7, field8, field9, field10, + field11, field12, field13, field14); + } + else + { + /* Single character line */ + fill_attribute (i, field1, field2, field3, field4, field5, + field6, field7, field8, field9, field10, + field11, field12, field13, field14); + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* General category. */ +/* See Unicode 3.0 book, section 4.5, + UCD.html. */ + +static bool +is_category_L (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L'); +} + +static bool +is_category_LC (unsigned int ch) +{ + /* See PropertyValueAliases.txt. */ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && (unicode_attributes[ch].category[1] == 'u' + || unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 't')); +} + +static bool +is_category_Lu (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'u'); +} + +static bool +is_category_Ll (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'l'); +} + +static bool +is_category_Lt (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 't'); +} + +static bool +is_category_Lm (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'm'); +} + +static bool +is_category_Lo (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'L' + && unicode_attributes[ch].category[1] == 'o'); +} + +static bool +is_category_M (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'M'); +} + +static bool +is_category_Mn (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'M' + && unicode_attributes[ch].category[1] == 'n'); +} + +static bool +is_category_Mc (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'M' + && unicode_attributes[ch].category[1] == 'c'); +} + +static bool +is_category_Me (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'M' + && unicode_attributes[ch].category[1] == 'e'); +} + +static bool +is_category_N (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'N'); +} + +static bool +is_category_Nd (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'd'); +} + +static bool +is_category_Nl (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'l'); +} + +static bool +is_category_No (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'o'); +} + +static bool +is_category_P (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P'); +} + +static bool +is_category_Pc (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'c'); +} + +static bool +is_category_Pd (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'd'); +} + +static bool +is_category_Ps (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 's'); +} + +static bool +is_category_Pe (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'e'); +} + +static bool +is_category_Pi (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'i'); +} + +static bool +is_category_Pf (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'f'); +} + +static bool +is_category_Po (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'o'); +} + +static bool +is_category_S (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'S'); +} + +static bool +is_category_Sm (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'm'); +} + +static bool +is_category_Sc (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'c'); +} + +static bool +is_category_Sk (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'k'); +} + +static bool +is_category_So (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'o'); +} + +static bool +is_category_Z (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z'); +} + +static bool +is_category_Zs (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 's'); +} + +static bool +is_category_Zl (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 'l'); +} + +static bool +is_category_Zp (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 'p'); +} + +static bool +is_category_C (unsigned int ch) +{ + return (unicode_attributes[ch].name == NULL + || unicode_attributes[ch].category[0] == 'C'); +} + +static bool +is_category_Cc (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'C' + && unicode_attributes[ch].category[1] == 'c'); +} + +static bool +is_category_Cf (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'C' + && unicode_attributes[ch].category[1] == 'f'); +} + +static bool +is_category_Cs (unsigned int ch) +{ + return (ch >= 0xd800 && ch < 0xe000); +} + +static bool +is_category_Co (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'C' + && unicode_attributes[ch].category[1] == 'o'); +} + +static bool +is_category_Cn (unsigned int ch) +{ + return (unicode_attributes[ch].name == NULL + && !(ch >= 0xd800 && ch < 0xe000)); +} + +/* Output a boolean property in a human readable format. */ +static void +debug_output_predicate (const char *filename, bool (*predicate) (unsigned int)) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + +#if 0 /* This yields huge text output. */ + for (ch = 0; ch < 0x110000; ch++) + if (predicate (ch)) + { + fprintf (stream, "0x%04X\n", ch); + } +#else + for (ch = 0; ch < 0x110000; ch++) + if (predicate (ch)) + { + unsigned int first = ch; + unsigned int last; + + while (ch + 1 < 0x110000 && predicate (ch + 1)) + ch++; + last = ch; + if (first < last) + fprintf (stream, "0x%04X..0x%04X\n", first, last); + else + fprintf (stream, "0x%04X\n", ch); + } +#endif + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Output the unit test for a boolean property. */ +static void +output_predicate_test (const char *filename, bool (*predicate) (unsigned int), const char *expression) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Test the Unicode character type functions.\n"); + fprintf (stream, " Copyright (C) 2007 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"); + fprintf (stream, "\n"); + fprintf (stream, "#include \"test-predicate-part1.h\"\n"); + fprintf (stream, "\n"); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + if (predicate (ch)) + { + unsigned int first = ch; + unsigned int last; + + while (ch + 1 < 0x110000 && predicate (ch + 1)) + ch++; + last = ch; + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, 0x%04X }", first, last); + need_comma = true; + } + if (need_comma) + fprintf (stream, "\n"); + + fprintf (stream, "\n"); + fprintf (stream, "#define PREDICATE(c) %s\n", expression); + fprintf (stream, "#include \"test-predicate-part2.h\"\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE predicate_table +#define xmalloc malloc +#define xrealloc realloc +#include "3levelbit.h" + +/* Output a boolean property in a three-level bitmap. */ +static void +output_predicate (const char *filename, bool (*predicate) (unsigned int), const char *name, const char *comment, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct predicate_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* %s of Unicode characters. */\n", comment); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 4; /* or: 5 */ + t.q = 7; /* or: 6 */ + predicate_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + if (predicate (ch)) + predicate_table_add (&t, ch); + + predicate_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + if (i != 1) + fprintf (stream, "#define header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int header[1];\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " /*unsigned*/ int level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "%s =\n", name); + fprintf (stream, "{\n"); + fprintf (stream, " { %d },\n", ((uint32_t *) t.result)[1]); + fprintf (stream, " {"); + if (t.level1_size > 1) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 1) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu * sizeof (int) / sizeof (short) + %5zu", + 1 + t.level1_size, (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 1) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 1) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 1) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu + %5zu * sizeof (short) / sizeof (int) + %5zu", + 1 + t.level1_size, t.level2_size << t.q, (offset - level3_offset) / sizeof (uint32_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 1) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 4) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%08X", + ((uint32_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Output all categories. */ +static void +output_categories (const char *version) +{ +#define CATEGORY(C) \ + debug_output_predicate ("unictype/categ_" #C ".txt", is_category_ ## C); \ + output_predicate_test ("../tests/unictype/test-categ_" #C ".c", is_category_ ## C, "uc_is_general_category (c, UC_CATEGORY_" #C ")"); \ + output_predicate ("unictype/categ_" #C ".h", is_category_ ## C, "u_categ_" #C, "Categories", version); + CATEGORY (L) + CATEGORY (LC) + CATEGORY (Lu) + CATEGORY (Ll) + CATEGORY (Lt) + CATEGORY (Lm) + CATEGORY (Lo) + CATEGORY (M) + CATEGORY (Mn) + CATEGORY (Mc) + CATEGORY (Me) + CATEGORY (N) + CATEGORY (Nd) + CATEGORY (Nl) + CATEGORY (No) + CATEGORY (P) + CATEGORY (Pc) + CATEGORY (Pd) + CATEGORY (Ps) + CATEGORY (Pe) + CATEGORY (Pi) + CATEGORY (Pf) + CATEGORY (Po) + CATEGORY (S) + CATEGORY (Sm) + CATEGORY (Sc) + CATEGORY (Sk) + CATEGORY (So) + CATEGORY (Z) + CATEGORY (Zs) + CATEGORY (Zl) + CATEGORY (Zp) + CATEGORY (C) + CATEGORY (Cc) + CATEGORY (Cf) + CATEGORY (Cs) + CATEGORY (Co) + CATEGORY (Cn) +#undef CATEGORY +} + +enum +{ + UC_CATEGORY_MASK_L = 0x0000001f, + UC_CATEGORY_MASK_LC = 0x00000007, + UC_CATEGORY_MASK_Lu = 0x00000001, + UC_CATEGORY_MASK_Ll = 0x00000002, + UC_CATEGORY_MASK_Lt = 0x00000004, + UC_CATEGORY_MASK_Lm = 0x00000008, + UC_CATEGORY_MASK_Lo = 0x00000010, + UC_CATEGORY_MASK_M = 0x000000e0, + UC_CATEGORY_MASK_Mn = 0x00000020, + UC_CATEGORY_MASK_Mc = 0x00000040, + UC_CATEGORY_MASK_Me = 0x00000080, + UC_CATEGORY_MASK_N = 0x00000700, + UC_CATEGORY_MASK_Nd = 0x00000100, + UC_CATEGORY_MASK_Nl = 0x00000200, + UC_CATEGORY_MASK_No = 0x00000400, + UC_CATEGORY_MASK_P = 0x0003f800, + UC_CATEGORY_MASK_Pc = 0x00000800, + UC_CATEGORY_MASK_Pd = 0x00001000, + UC_CATEGORY_MASK_Ps = 0x00002000, + UC_CATEGORY_MASK_Pe = 0x00004000, + UC_CATEGORY_MASK_Pi = 0x00008000, + UC_CATEGORY_MASK_Pf = 0x00010000, + UC_CATEGORY_MASK_Po = 0x00020000, + UC_CATEGORY_MASK_S = 0x003c0000, + UC_CATEGORY_MASK_Sm = 0x00040000, + UC_CATEGORY_MASK_Sc = 0x00080000, + UC_CATEGORY_MASK_Sk = 0x00100000, + UC_CATEGORY_MASK_So = 0x00200000, + UC_CATEGORY_MASK_Z = 0x01c00000, + UC_CATEGORY_MASK_Zs = 0x00400000, + UC_CATEGORY_MASK_Zl = 0x00800000, + UC_CATEGORY_MASK_Zp = 0x01000000, + UC_CATEGORY_MASK_C = 0x3e000000, + UC_CATEGORY_MASK_Cc = 0x02000000, + UC_CATEGORY_MASK_Cf = 0x04000000, + UC_CATEGORY_MASK_Cs = 0x08000000, + UC_CATEGORY_MASK_Co = 0x10000000, + UC_CATEGORY_MASK_Cn = 0x20000000 +}; + +static int +general_category_byname (const char *category_name) +{ + if (category_name[0] != '\0' + && (category_name[1] == '\0' || category_name[2] == '\0')) + switch (category_name[0]) + { + case 'L': + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_L; + case 'C': return UC_CATEGORY_MASK_LC; + case 'u': return UC_CATEGORY_MASK_Lu; + case 'l': return UC_CATEGORY_MASK_Ll; + case 't': return UC_CATEGORY_MASK_Lt; + case 'm': return UC_CATEGORY_MASK_Lm; + case 'o': return UC_CATEGORY_MASK_Lo; + } + break; + case 'M': + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_M; + case 'n': return UC_CATEGORY_MASK_Mn; + case 'c': return UC_CATEGORY_MASK_Mc; + case 'e': return UC_CATEGORY_MASK_Me; + } + break; + case 'N': + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_N; + case 'd': return UC_CATEGORY_MASK_Nd; + case 'l': return UC_CATEGORY_MASK_Nl; + case 'o': return UC_CATEGORY_MASK_No; + } + break; + case 'P': + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_P; + case 'c': return UC_CATEGORY_MASK_Pc; + case 'd': return UC_CATEGORY_MASK_Pd; + case 's': return UC_CATEGORY_MASK_Ps; + case 'e': return UC_CATEGORY_MASK_Pe; + case 'i': return UC_CATEGORY_MASK_Pi; + case 'f': return UC_CATEGORY_MASK_Pf; + case 'o': return UC_CATEGORY_MASK_Po; + } + break; + case 'S': + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_S; + case 'm': return UC_CATEGORY_MASK_Sm; + case 'c': return UC_CATEGORY_MASK_Sc; + case 'k': return UC_CATEGORY_MASK_Sk; + case 'o': return UC_CATEGORY_MASK_So; + } + break; + case 'Z': + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_Z; + case 's': return UC_CATEGORY_MASK_Zs; + case 'l': return UC_CATEGORY_MASK_Zl; + case 'p': return UC_CATEGORY_MASK_Zp; + } + break; + case 'C': + switch (category_name[1]) + { + case '\0': return UC_CATEGORY_MASK_C; + case 'c': return UC_CATEGORY_MASK_Cc; + case 'f': return UC_CATEGORY_MASK_Cf; + case 's': return UC_CATEGORY_MASK_Cs; + case 'o': return UC_CATEGORY_MASK_Co; + case 'n': return UC_CATEGORY_MASK_Cn; + } + break; + } + /* Invalid category name. */ + abort (); +} + +/* Construction of sparse 3-level tables. */ +#define TABLE category_table +#define ELEMENT uint8_t +#define DEFAULT 29 /* = log2(UC_CATEGORY_MASK_Cn) */ +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output the per-character category table. */ +static void +output_category (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct category_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint16_t *level3_packed; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Categories of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + category_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value; + unsigned int log2_value; + + if (is_category_Cs (ch)) + value = UC_CATEGORY_MASK_Cs; + else if (unicode_attributes[ch].name != NULL) + value = general_category_byname (unicode_attributes[ch].category); + else + continue; + + /* Now value should contain exactly one bit. */ + if (value == 0 || ((value & (value - 1)) != 0)) + abort (); + + for (log2_value = 0; value > 1; value >>= 1, log2_value++); + + category_table_add (&t, ch, log2_value); + } + + category_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define category_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, + (1 << t.p) * 5 / 16); + fprintf (stream, " }\n"); + fprintf (stream, "u_category =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units, + not 32-bit units, in order to make the lookup function easier. */ + level3_packed = + (uint16_t *) + calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t)); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned int j = (i * 5) / 16; + unsigned int k = (i * 5) % 16; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i]; + value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k); + level3_packed[j] = value & 0xffff; + level3_packed[j+1] = value >> 16; + } + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%04x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + free (level3_packed); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Canonical combining class. */ +/* See Unicode 3.0 book, section 4.2, + UCD.html. */ + +/* Construction of sparse 3-level tables. */ +#define TABLE combclass_table +#define ELEMENT uint8_t +#define DEFAULT 0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output the per-character combining class table. */ +static void +output_combclass (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct combclass_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Combining class of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + combclass_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + if (unicode_attributes[ch].name != NULL) + { + int value = atoi (unicode_attributes[ch].combining); + if (!(value >= 0 && value <= 255)) + abort (); + combclass_table_add (&t, ch, value); + } + + combclass_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define combclass_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_combclass =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Bidirectional category. */ +/* See Unicode 3.0 book, section 4.3, + UCD.html. */ + +enum +{ + UC_BIDI_L, /* Left-to-Right */ + UC_BIDI_LRE, /* Left-to-Right Embedding */ + UC_BIDI_LRO, /* Left-to-Right Override */ + UC_BIDI_R, /* Right-to-Left */ + UC_BIDI_AL, /* Right-to-Left Arabic */ + UC_BIDI_RLE, /* Right-to-Left Embedding */ + UC_BIDI_RLO, /* Right-to-Left Override */ + UC_BIDI_PDF, /* Pop Directional Format */ + UC_BIDI_EN, /* European Number */ + UC_BIDI_ES, /* European Number Separator */ + UC_BIDI_ET, /* European Number Terminator */ + UC_BIDI_AN, /* Arabic Number */ + UC_BIDI_CS, /* Common Number Separator */ + UC_BIDI_NSM, /* Non-Spacing Mark */ + UC_BIDI_BN, /* Boundary Neutral */ + UC_BIDI_B, /* Paragraph Separator */ + UC_BIDI_S, /* Segment Separator */ + UC_BIDI_WS, /* Whitespace */ + UC_BIDI_ON /* Other Neutral */ +}; + +static int +bidi_category_byname (const char *category_name) +{ + switch (category_name[0]) + { + case 'A': + switch (category_name[1]) + { + case 'L': + if (category_name[2] == '\0') + return UC_BIDI_AL; + break; + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_AN; + break; + } + break; + case 'B': + switch (category_name[1]) + { + case '\0': + return UC_BIDI_B; + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_BN; + break; + } + break; + case 'C': + switch (category_name[1]) + { + case 'S': + if (category_name[2] == '\0') + return UC_BIDI_CS; + break; + } + break; + case 'E': + switch (category_name[1]) + { + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_EN; + break; + case 'S': + if (category_name[2] == '\0') + return UC_BIDI_ES; + break; + case 'T': + if (category_name[2] == '\0') + return UC_BIDI_ET; + break; + } + break; + case 'L': + switch (category_name[1]) + { + case '\0': + return UC_BIDI_L; + case 'R': + switch (category_name[2]) + { + case 'E': + if (category_name[3] == '\0') + return UC_BIDI_LRE; + break; + case 'O': + if (category_name[3] == '\0') + return UC_BIDI_LRO; + break; + } + break; + } + break; + case 'N': + switch (category_name[1]) + { + case 'S': + switch (category_name[2]) + { + case 'M': + if (category_name[3] == '\0') + return UC_BIDI_NSM; + break; + } + break; + } + break; + case 'O': + switch (category_name[1]) + { + case 'N': + if (category_name[2] == '\0') + return UC_BIDI_ON; + break; + } + break; + case 'P': + switch (category_name[1]) + { + case 'D': + switch (category_name[2]) + { + case 'F': + if (category_name[3] == '\0') + return UC_BIDI_PDF; + break; + } + break; + } + break; + case 'R': + switch (category_name[1]) + { + case '\0': + return UC_BIDI_R; + case 'L': + switch (category_name[2]) + { + case 'E': + if (category_name[3] == '\0') + return UC_BIDI_RLE; + break; + case 'O': + if (category_name[3] == '\0') + return UC_BIDI_RLO; + break; + } + break; + } + break; + case 'S': + if (category_name[1] == '\0') + return UC_BIDI_S; + break; + case 'W': + switch (category_name[1]) + { + case 'S': + if (category_name[2] == '\0') + return UC_BIDI_WS; + break; + } + break; + } + /* Invalid bidi category name. */ + abort (); +} + +static int +get_bidi_category (unsigned int ch) +{ + if (unicode_attributes[ch].name != NULL) + return bidi_category_byname (unicode_attributes[ch].bidi); + else + { + /* The bidi category of unassigned characters depends on the range. + See UTR #9 and DerivedBidiClass.txt. */ + if ((ch >= 0x0590 && ch <= 0x05FF) + || (ch >= 0x07FB && ch <= 0x08FF) + || (ch >= 0xFB37 && ch <= 0xFB45) + || (ch >= 0x10800 && ch <= 0x10FFF)) + return UC_BIDI_R; + else if ((ch >= 0x0600 && ch <= 0x07BF) + || (ch >= 0x2064 && ch <= 0x2069) + || (ch >= 0xFBB2 && ch <= 0xFDCF) + || (ch >= 0xFDFE && ch <= 0xFEFE)) + return UC_BIDI_AL; + else if ((ch >= 0xFDD0 && ch <= 0xFDEF) + || (ch >= 0xFFF0 && ch <= 0xFFFF) + || (ch & 0xFFFF) == 0xFFFE + || (ch & 0xFFFF) == 0xFFFF + || (ch >= 0xE0000 && ch <= 0xE0FFF)) + return UC_BIDI_BN; + else + return UC_BIDI_L; + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE bidi_category_table +#define ELEMENT uint8_t +#define DEFAULT UC_BIDI_L +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output the per-character bidi category table. */ +static void +output_bidi_category (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct bidi_category_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint16_t *level3_packed; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Bidi categories of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + bidi_category_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value = get_bidi_category (ch); + + bidi_category_table_add (&t, ch, value); + } + + bidi_category_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define bidi_category_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, + (1 << t.p) * 5 / 16); + fprintf (stream, " }\n"); + fprintf (stream, "u_bidi_category =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 5 bits only. Use 16-bit units, + not 32-bit units, in order to make the lookup function easier. */ + level3_packed = + (uint16_t *) + calloc ((t.level3_size << t.p) * 5 / 16 + 1, sizeof (uint16_t)); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned int j = (i * 5) / 16; + unsigned int k = (i * 5) % 16; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i]; + value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k); + level3_packed[j] = value & 0xffff; + level3_packed[j+1] = value >> 16; + } + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 5 / 16 + 1; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%04x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 5 / 16 + 1) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 5 / 16 + 1 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + free (level3_packed); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Decimal digit value. */ +/* See Unicode 3.0 book, section 4.6. */ + +static int +get_decdigit_value (unsigned int ch) +{ + if (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].decdigit[0] != '\0') + return atoi (unicode_attributes[ch].decdigit); + return -1; +} + +/* Construction of sparse 3-level tables. */ +#define TABLE decdigit_table +#define ELEMENT uint8_t +#define DEFAULT 0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output the unit test for the per-character decimal digit value table. */ +static void +output_decimal_digit_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = get_decdigit_value (ch); + + if (!(value >= -1 && value < 10)) + abort (); + + if (value >= 0) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %d }", ch, value); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Output the per-character decimal digit value table. */ +static void +output_decimal_digit (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct decdigit_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Decimal digit values of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + decdigit_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value = 1 + get_decdigit_value (ch); + + if (!(value >= 0 && value <= 10)) + abort (); + + decdigit_table_add (&t, ch, value); + } + + decdigit_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define decdigit_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, + t.p - 1); + fprintf (stream, " }\n"); + fprintf (stream, "u_decdigit =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 4 bits only. */ + fprintf (stream, " {"); + if (t.level3_size << (t.p - 1) > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << (t.p - 1); i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x", + ((uint8_t *) (t.result + level3_offset))[2*i] + + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4)); + if (i+1 < t.level3_size << (t.p - 1)) + fprintf (stream, ","); + } + if (t.level3_size << (t.p - 1) > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Digit value. */ +/* See Unicode 3.0 book, section 4.6. */ + +static int +get_digit_value (unsigned int ch) +{ + if (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].digit[0] != '\0') + return atoi (unicode_attributes[ch].digit); + return -1; +} + +/* Output the unit test for the per-character digit value table. */ +static void +output_digit_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Digit values of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = get_digit_value (ch); + + if (!(value >= -1 && value < 10)) + abort (); + + if (value >= 0) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %d }", ch, value); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Output the per-character digit value table. */ +static void +output_digit (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct decdigit_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Digit values of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + decdigit_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value = 1 + get_digit_value (ch); + + if (!(value >= 0 && value <= 10)) + abort (); + + decdigit_table_add (&t, ch, value); + } + + decdigit_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define digit_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, + t.p - 1); + fprintf (stream, " }\n"); + fprintf (stream, "u_digit =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 4 bits only. */ + fprintf (stream, " {"); + if (t.level3_size << (t.p - 1) > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << (t.p - 1); i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x", + ((uint8_t *) (t.result + level3_offset))[2*i] + + (((uint8_t *) (t.result + level3_offset))[2*i+1] << 4)); + if (i+1 < t.level3_size << (t.p - 1)) + fprintf (stream, ","); + } + if (t.level3_size << (t.p - 1) > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Numeric value. */ +/* See Unicode 3.0 book, section 4.6. */ + +typedef struct { int numerator; int denominator; } uc_fraction_t; + +static uc_fraction_t +get_numeric_value (unsigned int ch) +{ + uc_fraction_t value; + + if (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].numeric[0] != '\0') + { + const char *str = unicode_attributes[ch].numeric; + /* str is of the form "integer" or "integer/posinteger". */ + value.numerator = atoi (str); + if (strchr (str, '/') != NULL) + value.denominator = atoi (strchr (str, '/') + 1); + else + value.denominator = 1; + } + else + { + value.numerator = 0; + value.denominator = 0; + } + return value; +} + +/* Output the unit test for the per-character numeric value table. */ +static void +output_numeric_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Numeric values of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + uc_fraction_t value = get_numeric_value (ch); + + if (value.numerator != 0 || value.denominator != 0) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %d, %d }", + ch, value.numerator, value.denominator); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE numeric_table +#define ELEMENT uint8_t +#define DEFAULT 0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output the per-character numeric value table. */ +static void +output_numeric (const char *filename, const char *version) +{ + FILE *stream; + uc_fraction_t fractions[128]; + unsigned int nfractions; + unsigned int ch, i, j; + struct numeric_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint16_t *level3_packed; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Numeric values of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + /* Create table of occurring fractions. */ + nfractions = 0; + for (ch = 0; ch < 0x110000; ch++) + { + uc_fraction_t value = get_numeric_value (ch); + + for (i = 0; i < nfractions; i++) + if (value.numerator == fractions[i].numerator + && value.denominator == fractions[i].denominator) + break; + if (i == nfractions) + { + if (nfractions == 128) + abort (); + for (i = 0; i < nfractions; i++) + if (value.denominator < fractions[i].denominator + || (value.denominator == fractions[i].denominator + && value.numerator < fractions[i].numerator)) + break; + for (j = nfractions; j > i; j--) + fractions[j] = fractions[j - 1]; + fractions[i] = value; + nfractions++; + } + } + + fprintf (stream, "static const uc_fraction_t u_numeric_values[%d] =\n", + nfractions); + fprintf (stream, "{\n"); + for (i = 0; i < nfractions; i++) + { + fprintf (stream, " { %d, %d }", fractions[i].numerator, + fractions[i].denominator); + if (i+1 < nfractions) + fprintf (stream, ","); + fprintf (stream, "\n"); + } + fprintf (stream, "};\n"); + + t.p = 7; + t.q = 9; + numeric_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + uc_fraction_t value = get_numeric_value (ch); + + for (i = 0; i < nfractions; i++) + if (value.numerator == fractions[i].numerator + && value.denominator == fractions[i].denominator) + break; + if (i == nfractions) + abort (); + + numeric_table_add (&t, ch, i); + } + + numeric_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define numeric_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned short level3[%zu * %d + 1];\n", t.level3_size, + (1 << t.p) * 7 / 16); + fprintf (stream, " }\n"); + fprintf (stream, "u_numeric =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 7 bits only. Use 16-bit units, + not 32-bit units, in order to make the lookup function easier. */ + level3_packed = + (uint16_t *) + calloc ((t.level3_size << t.p) * 7 / 16 + 1, sizeof (uint16_t)); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned int j = (i * 7) / 16; + unsigned int k = (i * 7) % 16; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i]; + value = level3_packed[j] | (level3_packed[j+1] << 16) | (value << k); + level3_packed[j] = value & 0xffff; + level3_packed[j+1] = value >> 16; + } + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 7 / 16 + 1 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 7 / 16 + 1; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%04x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 7 / 16 + 1) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 7 / 16 + 1 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + free (level3_packed); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Mirrored. */ +/* See Unicode 3.0 book, section 4.7, + UAX #9. */ + +/* List of mirrored character pairs. This is a subset of the characters + having the BidiMirrored property. */ +static unsigned int mirror_pairs[][2] = +{ + { 0x0028, 0x0029 }, + { 0x003C, 0x003E }, + { 0x005B, 0x005D }, + { 0x007B, 0x007D }, + { 0x00AB, 0x00BB }, + { 0x2039, 0x203A }, + { 0x2045, 0x2046 }, + { 0x207D, 0x207E }, + { 0x208D, 0x208E }, + { 0x2208, 0x220B }, + { 0x220A, 0x220D }, + { 0x223C, 0x223D }, + { 0x2243, 0x22CD }, + { 0x2252, 0x2253 }, + { 0x2254, 0x2255 }, + { 0x2264, 0x2265 }, + { 0x2266, 0x2267 }, + { 0x226A, 0x226B }, + { 0x2276, 0x2277 }, + { 0x2278, 0x2279 }, + { 0x227A, 0x227B }, + { 0x227C, 0x227D }, + { 0x2282, 0x2283 }, + { 0x2286, 0x2287 }, + { 0x228F, 0x2290 }, + { 0x2291, 0x2292 }, + { 0x22A2, 0x22A3 }, + { 0x22B0, 0x22B1 }, + { 0x22B2, 0x22B3 }, + { 0x22B4, 0x22B5 }, + { 0x22B6, 0x22B7 }, + { 0x22C9, 0x22CA }, + { 0x22CB, 0x22CC }, + { 0x22D0, 0x22D1 }, + { 0x22D6, 0x22D7 }, + { 0x22D8, 0x22D9 }, + { 0x22DA, 0x22DB }, + { 0x22DC, 0x22DD }, + { 0x22DE, 0x22DF }, + { 0x22F0, 0x22F1 }, + { 0x2308, 0x2309 }, + { 0x230A, 0x230B }, + { 0x2329, 0x232A }, + { 0x3008, 0x3009 }, + { 0x300A, 0x300B }, + { 0x300C, 0x300D }, + { 0x300E, 0x300F }, + { 0x3010, 0x3011 }, + { 0x3014, 0x3015 }, + { 0x3016, 0x3017 }, + { 0x3018, 0x3019 }, + { 0x301A, 0x301B } +}; + +static int +get_mirror_value (unsigned int ch) +{ + bool mirrored; + unsigned int mirror_char; + unsigned int i; + + mirrored = (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].mirrored); + mirror_char = 0xfffd; + for (i = 0; i < sizeof (mirror_pairs) / sizeof (mirror_pairs[0]); i++) + if (ch == mirror_pairs[i][0]) + { + mirror_char = mirror_pairs[i][1]; + break; + } + else if (ch == mirror_pairs[i][1]) + { + mirror_char = mirror_pairs[i][0]; + break; + } + if (mirrored) + return (int) mirror_char - (int) ch; + else + { + if (mirror_char != 0xfffd) + abort (); + return 0; + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE mirror_table +#define ELEMENT int32_t +#define DEFAULT 0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output the per-character mirror table. */ +static void +output_mirror (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct mirror_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Mirrored Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + mirror_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value = get_mirror_value (ch); + + mirror_table_add (&t, ch, value); + } + + mirror_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define mirror_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_mirror =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (int32_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Particular values of the word break property. */ + +static bool +is_WBP_MIDNUMLET (unsigned int ch) +{ + return (ch == 0x0027 || ch == 0x002E || ch == 0x2018 || ch == 0x2019 + || ch == 0x2024 || ch == 0xFE52 || ch == 0xFF07 || ch == 0xFF0E); +} + +static bool +is_WBP_MIDLETTER (unsigned int ch) +{ + return (ch == 0x00B7 || ch == 0x05F4 || ch == 0x2027 || ch == 0x003A + || ch == 0x0387 || ch == 0xFE13 || ch == 0xFE55 || ch == 0xFF1A); +} + +/* ========================================================================= */ + +/* Properties. */ + +/* Reading PropList.txt and DerivedCoreProperties.txt. */ +enum +{ + /* PropList.txt */ + PROP_WHITE_SPACE, + PROP_BIDI_CONTROL, + PROP_JOIN_CONTROL, + PROP_DASH, + PROP_HYPHEN, + PROP_QUOTATION_MARK, + PROP_TERMINAL_PUNCTUATION, + PROP_OTHER_MATH, + PROP_HEX_DIGIT, + PROP_ASCII_HEX_DIGIT, + PROP_OTHER_ALPHABETIC, + PROP_IDEOGRAPHIC, + PROP_DIACRITIC, + PROP_EXTENDER, + PROP_OTHER_LOWERCASE, + PROP_OTHER_UPPERCASE, + PROP_NONCHARACTER_CODE_POINT, + PROP_OTHER_GRAPHEME_EXTEND, + PROP_IDS_BINARY_OPERATOR, + PROP_IDS_TRINARY_OPERATOR, + PROP_RADICAL, + PROP_UNIFIED_IDEOGRAPH, + PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT, + PROP_DEPRECATED, + PROP_SOFT_DOTTED, + PROP_LOGICAL_ORDER_EXCEPTION, + PROP_OTHER_ID_START, + PROP_OTHER_ID_CONTINUE, + PROP_STERM, + PROP_VARIATION_SELECTOR, + PROP_PATTERN_WHITE_SPACE, + PROP_PATTERN_SYNTAX, + /* DerivedCoreProperties.txt */ + PROP_MATH, + PROP_ALPHABETIC, + PROP_LOWERCASE, + PROP_UPPERCASE, + PROP_CASED, + PROP_CASE_IGNORABLE, + PROP_CHANGES_WHEN_LOWERCASED, + PROP_CHANGES_WHEN_UPPERCASED, + PROP_CHANGES_WHEN_TITLECASED, + PROP_CHANGES_WHEN_CASEFOLDED, + PROP_CHANGES_WHEN_CASEMAPPED, + PROP_ID_START, + PROP_ID_CONTINUE, + PROP_XID_START, + PROP_XID_CONTINUE, + PROP_DEFAULT_IGNORABLE_CODE_POINT, + PROP_GRAPHEME_EXTEND, + PROP_GRAPHEME_BASE, + PROP_GRAPHEME_LINK +}; +unsigned long long unicode_properties[0x110000]; + +static void +clear_properties (void) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + unicode_properties[i] = 0; +} + +/* Stores in unicode_properties[] the properties from the + PropList.txt or DerivedCoreProperties.txt file. */ +static void +fill_properties (const char *proplist_filename) +{ + unsigned int i; + FILE *stream; + + stream = fopen (proplist_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", proplist_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + unsigned int propvalue; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", proplist_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + /* PropList.txt */ + PROP ("White_Space", PROP_WHITE_SPACE) + PROP ("Bidi_Control", PROP_BIDI_CONTROL) + PROP ("Join_Control", PROP_JOIN_CONTROL) + PROP ("Dash", PROP_DASH) + PROP ("Hyphen", PROP_HYPHEN) + PROP ("Quotation_Mark", PROP_QUOTATION_MARK) + PROP ("Terminal_Punctuation", PROP_TERMINAL_PUNCTUATION) + PROP ("Other_Math", PROP_OTHER_MATH) + PROP ("Hex_Digit", PROP_HEX_DIGIT) + PROP ("ASCII_Hex_Digit", PROP_ASCII_HEX_DIGIT) + PROP ("Other_Alphabetic", PROP_OTHER_ALPHABETIC) + PROP ("Ideographic", PROP_IDEOGRAPHIC) + PROP ("Diacritic", PROP_DIACRITIC) + PROP ("Extender", PROP_EXTENDER) + PROP ("Other_Lowercase", PROP_OTHER_LOWERCASE) + PROP ("Other_Uppercase", PROP_OTHER_UPPERCASE) + PROP ("Noncharacter_Code_Point", PROP_NONCHARACTER_CODE_POINT) + PROP ("Other_Grapheme_Extend", PROP_OTHER_GRAPHEME_EXTEND) + PROP ("IDS_Binary_Operator", PROP_IDS_BINARY_OPERATOR) + PROP ("IDS_Trinary_Operator", PROP_IDS_TRINARY_OPERATOR) + PROP ("Radical", PROP_RADICAL) + PROP ("Unified_Ideograph", PROP_UNIFIED_IDEOGRAPH) + PROP ("Other_Default_Ignorable_Code_Point", PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT) + PROP ("Deprecated", PROP_DEPRECATED) + PROP ("Soft_Dotted", PROP_SOFT_DOTTED) + PROP ("Logical_Order_Exception", PROP_LOGICAL_ORDER_EXCEPTION) + PROP ("Other_ID_Start", PROP_OTHER_ID_START) + PROP ("Other_ID_Continue", PROP_OTHER_ID_CONTINUE) + PROP ("STerm", PROP_STERM) + PROP ("Variation_Selector", PROP_VARIATION_SELECTOR) + PROP ("Pattern_White_Space", PROP_PATTERN_WHITE_SPACE) + PROP ("Pattern_Syntax", PROP_PATTERN_SYNTAX) + /* DerivedCoreProperties.txt */ + PROP ("Math", PROP_MATH) + PROP ("Alphabetic", PROP_ALPHABETIC) + PROP ("Lowercase", PROP_LOWERCASE) + PROP ("Uppercase", PROP_UPPERCASE) + PROP ("Cased", PROP_CASED) + PROP ("Case_Ignorable", PROP_CASE_IGNORABLE) + PROP ("Changes_When_Lowercased", PROP_CHANGES_WHEN_LOWERCASED) + PROP ("Changes_When_Uppercased", PROP_CHANGES_WHEN_UPPERCASED) + PROP ("Changes_When_Titlecased", PROP_CHANGES_WHEN_TITLECASED) + PROP ("Changes_When_Casefolded", PROP_CHANGES_WHEN_CASEFOLDED) + PROP ("Changes_When_Casemapped", PROP_CHANGES_WHEN_CASEMAPPED) + PROP ("ID_Start", PROP_ID_START) + PROP ("ID_Continue", PROP_ID_CONTINUE) + PROP ("XID_Start", PROP_XID_START) + PROP ("XID_Continue", PROP_XID_CONTINUE) + PROP ("Default_Ignorable_Code_Point", PROP_DEFAULT_IGNORABLE_CODE_POINT) + PROP ("Grapheme_Extend", PROP_GRAPHEME_EXTEND) + PROP ("Grapheme_Base", PROP_GRAPHEME_BASE) + PROP ("Grapheme_Link", PROP_GRAPHEME_LINK) +#undef PROP + { + fprintf (stderr, "unknown property named '%s' in '%s'\n", propname, + proplist_filename); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_properties[i] |= 1ULL << propvalue; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", proplist_filename); + exit (1); + } +} + +/* Stores in array the given property from the Unicode 3.0 PropList.txt + file. */ +static void +fill_property30 (char array[0x110000], const char *proplist_filename, const char *property_name) +{ + unsigned int i; + FILE *stream; + char buf[100+1]; + + for (i = 0; i < 0x110000; i++) + array[i] = 0; + + stream = fopen (proplist_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", proplist_filename); + exit (1); + } + + /* Search for the "Property dump for: ..." line. */ + do + { + if (fscanf (stream, "%100[^\n]\n", buf) < 1) + { + fprintf (stderr, "no property found in '%s'\n", proplist_filename); + exit (1); + } + } + while (strstr (buf, property_name) == NULL); + + for (;;) + { + unsigned int i1, i2; + + if (fscanf (stream, "%100[^\n]\n", buf) < 1) + break; + if (buf[0] == '*') + break; + if (strlen (buf) >= 10 && buf[4] == '.' && buf[5] == '.') + { + if (sscanf (buf, "%4X..%4X", &i1, &i2) < 2) + { + fprintf (stderr, "parse error in property in '%s'\n", + proplist_filename); + exit (1); + } + } + else if (strlen (buf) >= 4) + { + if (sscanf (buf, "%4X", &i1) < 1) + { + fprintf (stderr, "parse error in property in '%s'\n", + proplist_filename); + exit (1); + } + i2 = i1; + } + else + { + fprintf (stderr, "parse error in property in '%s'\n", + proplist_filename); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + for (i = i1; i <= i2; i++) + array[i] = 1; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", proplist_filename); + exit (1); + } +} + +/* Properties from Unicode 3.0 PropList.txt file. */ + +/* The paired punctuation property from the PropList.txt file. */ +char unicode_pairedpunctuation[0x110000]; + +/* The left of pair property from the PropList.txt file. */ +char unicode_leftofpair[0x110000]; + +static void +fill_properties30 (const char *proplist30_filename) +{ + fill_property30 (unicode_pairedpunctuation, proplist30_filename, "(Paired Punctuation)"); + fill_property30 (unicode_leftofpair, proplist30_filename, "(Left of Pair)"); +} + +/* ------------------------------------------------------------------------- */ + +/* See PropList.txt, UCD.html. */ +static bool +is_property_white_space (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_WHITE_SPACE)) != 0); +} + +/* See Unicode 3.0 book, section 4.10, + PropList.txt, UCD.html, + DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_alphabetic (unsigned int ch) +{ + bool result1 = + is_category_L (ch) + || ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0) + /* For some reason, the following are listed as having property + Alphabetic but not as having property Other_Alphabetic. */ + || (ch >= 0x16EE && ch <= 0x16F0) /* RUNIC SYMBOLS */ + || (ch >= 0x2160 && ch <= 0x2182) /* ROMAN NUMERALS */ + || (ch >= 0x2185 && ch <= 0x2188) /* ROMAN NUMERALS */ + || (ch >= 0x24D0 && ch <= 0x24E9) /* CIRCLED LATIN SMALL LETTER */ + || (ch == 0x3007) /* IDEOGRAPHIC NUMBER ZERO */ + || (ch >= 0x3021 && ch <= 0x3029) /* HANGZHOU NUMERAL */ + || (ch >= 0x3038 && ch <= 0x303A) /* HANGZHOU NUMERAL */ + || (ch >= 0xA6E6 && ch <= 0xA6EF) /* BAMUM LETTERS */ + || (ch >= 0x10140 && ch <= 0x10174) /* GREEK ACROPHONICS */ + || (ch == 0x10341) /* GOTHIC LETTER NINETY */ + || (ch == 0x1034A) /* GOTHIC LETTER NINE HUNDRED */ + || (ch >= 0x103D1 && ch <= 0x103D5) /* OLD PERSIAN NUMBERS */ + || (ch >= 0x12400 && ch <= 0x12462); /* CUNEIFORM NUMERIC SIGNS */ + bool result2 = + ((unicode_properties[ch] & (1ULL << PROP_ALPHABETIC)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_alphabetic (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ALPHABETIC)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_not_a_character (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_NONCHARACTER_CODE_POINT)) != 0); +} + +/* See PropList.txt, UCD.html, + DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_default_ignorable_code_point (unsigned int ch) +{ + bool result1 = + (is_category_Cf (ch) + && !(ch >= 0xFFF9 && ch <= 0xFFFB) /* Annotations */ + && !((ch >= 0x0600 && ch <= 0x0603) || ch == 0x06DD || ch == 0x070F) + /* For some reason, the following are not listed as having property + Default_Ignorable_Code_Point. */ + && !(ch == 0x110BD)) + || ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0) + || ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); + bool result2 = + ((unicode_properties[ch] & (1ULL << PROP_DEFAULT_IGNORABLE_CODE_POINT)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_default_ignorable_code_point (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_DEFAULT_IGNORABLE_CODE_POINT)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_deprecated (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_DEPRECATED)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_logical_order_exception (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_LOGICAL_ORDER_EXCEPTION)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_variation_selector (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_VARIATION_SELECTOR)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_private_use (unsigned int ch) +{ + /* Determined through "grep 'Private Use,' UnicodeData-3.1.0.txt". */ + return (ch >= 0xE000 && ch <= 0xF8FF) + || (ch >= 0xF0000 && ch <= 0xFFFFD) + || (ch >= 0x100000 && ch <= 0x10FFFD); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_unassigned_code_value (unsigned int ch) +{ + return (is_category_Cn (ch) && !is_property_not_a_character (ch)); +} + +/* See PropList.txt, UCD.html, + DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_uppercase (unsigned int ch) +{ + bool result1 = + is_category_Lu (ch) + || ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0); + bool result2 = + ((unicode_properties[ch] & (1ULL << PROP_UPPERCASE)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_uppercase (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_UPPERCASE)) != 0); +} + +/* See PropList.txt, UCD.html, + DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_lowercase (unsigned int ch) +{ + bool result1 = + is_category_Ll (ch) + || ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0); + bool result2 = + ((unicode_properties[ch] & (1ULL << PROP_LOWERCASE)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_lowercase (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_LOWERCASE)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_titlecase (unsigned int ch) +{ + return is_category_Lt (ch); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_cased (unsigned int ch) +{ + bool result1 = (is_property_lowercase (ch) + || is_property_uppercase (ch) + || is_category_Lt (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASED)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_case_ignorable (unsigned int ch) +{ + bool result1 = (is_WBP_MIDLETTER (ch) || is_WBP_MIDNUMLET (ch) + || is_category_Mn (ch) + || is_category_Me (ch) + || is_category_Cf (ch) + || is_category_Lm (ch) + || is_category_Sk (ch)); + bool result2 = ((unicode_properties[ch] & (1ULL << PROP_CASE_IGNORABLE)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_lowercased (unsigned int ch) +{ + bool result1 = ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_LOWERCASED)) != 0); + bool result2 = (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].lower != NONE + && unicode_attributes[ch].lower != ch); + + if (result1 != result2) + abort (); + return result1; +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_uppercased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_UPPERCASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_titlecased (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_TITLECASED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casefolded (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEFOLDED)) != 0); +} + +/* See DerivedCoreProperties.txt. */ +static bool +is_property_changes_when_casemapped (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_CHANGES_WHEN_CASEMAPPED)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_soft_dotted (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_SOFT_DOTTED)) != 0); +} + +/* See DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_id_start (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_ID_START)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_id_start (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_START)) != 0); +} + +/* See DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_id_continue (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_ID_CONTINUE)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_id_continue (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_ID_CONTINUE)) != 0); +} + +/* See DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_xid_start (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_XID_START)) != 0); +} + +/* See DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_xid_continue (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_XID_CONTINUE)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_pattern_white_space (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_WHITE_SPACE)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_pattern_syntax (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_PATTERN_SYNTAX)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_join_control (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_JOIN_CONTROL)) != 0); +} + +/* See DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_grapheme_base (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_BASE)) != 0); +} + +/* See DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_grapheme_extend (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_EXTEND)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_grapheme_extend (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_GRAPHEME_EXTEND)) != 0); +} + +/* See DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_grapheme_link (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_GRAPHEME_LINK)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_bidi_control (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_BIDI_CONTROL)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_left_to_right (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_L); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_hebrew_right_to_left (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_R); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_arabic_right_to_left (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_AL); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_european_digit (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_EN); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_eur_num_separator (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_ES); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_eur_num_terminator (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_ET); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_arabic_digit (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_AN); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_common_separator (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_CS); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_block_separator (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_B); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_segment_separator (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_S); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_whitespace (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_WS); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_non_spacing_mark (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_NSM); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_boundary_neutral (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_BN); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_pdf (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_PDF); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_embedding_or_override (unsigned int ch) +{ + int category = get_bidi_category (ch); + return (category == UC_BIDI_LRE || category == UC_BIDI_LRO + || category == UC_BIDI_RLE || category == UC_BIDI_RLO); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_bidi_other_neutral (unsigned int ch) +{ + return (get_bidi_category (ch) == UC_BIDI_ON); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_hex_digit (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_HEX_DIGIT)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_ascii_hex_digit (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_ASCII_HEX_DIGIT)) != 0); +} + +/* See Unicode 3.0 book, section 4.10, + PropList.txt, UCD.html. */ +static bool +is_property_ideographic (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_IDEOGRAPHIC)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_unified_ideograph (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_UNIFIED_IDEOGRAPH)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_radical (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_RADICAL)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_ids_binary_operator (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_IDS_BINARY_OPERATOR)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_ids_trinary_operator (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_IDS_TRINARY_OPERATOR)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_zero_width (unsigned int ch) +{ + return is_category_Cf (ch) + || (unicode_attributes[ch].name != NULL + && strstr (unicode_attributes[ch].name, "ZERO WIDTH") != NULL); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_space (unsigned int ch) +{ + return is_category_Zs (ch); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_non_break (unsigned int ch) +{ + /* This is exactly the set of characters having line breaking + property GL. */ + return (ch == 0x00A0 /* NO-BREAK SPACE */ + || ch == 0x034F /* COMBINING GRAPHEME JOINER */ + || ch == 0x035C /* COMBINING DOUBLE BREVE BELOW */ + || ch == 0x035D /* COMBINING DOUBLE BREVE */ + || ch == 0x035E /* COMBINING DOUBLE MACRON */ + || ch == 0x035F /* COMBINING DOUBLE MACRON BELOW */ + || ch == 0x0360 /* COMBINING DOUBLE TILDE */ + || ch == 0x0361 /* COMBINING DOUBLE INVERTED BREVE */ + || ch == 0x0362 /* COMBINING DOUBLE RIGHTWARDS ARROW BELOW */ + || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ + || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ + || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ + || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ + || ch == 0x2007 /* FIGURE SPACE */ + || ch == 0x2011 /* NON-BREAKING HYPHEN */ + || ch == 0x202F /* NARROW NO-BREAK SPACE */); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_iso_control (unsigned int ch) +{ + bool result1 = + (unicode_attributes[ch].name != NULL + && strcmp (unicode_attributes[ch].name, "<control>") == 0); + bool result2 = + is_category_Cc (ch); + + if (result1 != result2) + abort (); + return result1; +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_format_control (unsigned int ch) +{ + return (is_category_Cf (ch) + && get_bidi_category (ch) == UC_BIDI_BN + && !is_property_join_control (ch) + && ch != 0xFEFF); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_dash (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_DASH)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_hyphen (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_HYPHEN)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_punctuation (unsigned int ch) +{ + return is_category_P (ch); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_line_separator (unsigned int ch) +{ + return is_category_Zl (ch); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_paragraph_separator (unsigned int ch) +{ + return is_category_Zp (ch); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_quotation_mark (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_QUOTATION_MARK)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_sentence_terminal (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_STERM)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_terminal_punctuation (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_TERMINAL_PUNCTUATION)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_currency_symbol (unsigned int ch) +{ + return is_category_Sc (ch); +} + +/* See Unicode 3.0 book, section 4.9, + PropList.txt, UCD.html, + DerivedCoreProperties.txt, UCD.html. */ +static bool +is_property_math (unsigned int ch) +{ + bool result1 = + is_category_Sm (ch) + || ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0); + bool result2 = + ((unicode_properties[ch] & (1ULL << PROP_MATH)) != 0); + + if (result1 != result2) + abort (); + return result1; +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_other_math (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_OTHER_MATH)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_paired_punctuation (unsigned int ch) +{ + return unicode_pairedpunctuation[ch]; +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_left_of_pair (unsigned int ch) +{ + return unicode_leftofpair[ch]; +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_combining (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && (strcmp (unicode_attributes[ch].combining, "0") != 0 + || is_category_Mc (ch) + || is_category_Me (ch) + || is_category_Mn (ch))); +} + +#if 0 /* same as is_property_bidi_non_spacing_mark */ +/* See PropList-3.0.1.txt. */ +static bool +is_property_non_spacing (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && get_bidi_category (ch) == UC_BIDI_NSM); +} +#endif + +/* See PropList-3.0.1.txt. */ +static bool +is_property_composite (unsigned int ch) +{ + /* This definition differs from the one in PropList-3.0.1.txt, but is more + logical in some sense. */ + if (ch >= 0xAC00 && ch <= 0xD7A4) /* Hangul Syllables */ + return true; + if (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].decomposition != NULL) + { + /* Test whether the decomposition contains more than one character, + and the first is not a space. */ + const char *decomp = unicode_attributes[ch].decomposition; + if (decomp[0] == '<') + { + decomp = strchr (decomp, '>') + 1; + if (decomp[0] == ' ') + decomp++; + } + return strchr (decomp, ' ') != NULL && strncmp (decomp, "0020 ", 5) != 0; + } + return false; +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_decimal_digit (unsigned int ch) +{ + return is_category_Nd (ch); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_numeric (unsigned int ch) +{ + return ((get_numeric_value (ch)).denominator > 0) + || (ch == 0x09F8) /* BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR */ + || (ch == 0x2183); /* ROMAN NUMERAL REVERSED ONE HUNDRED */ +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_diacritic (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_DIACRITIC)) != 0); +} + +/* See PropList.txt, UCD.html. */ +static bool +is_property_extender (unsigned int ch) +{ + return ((unicode_properties[ch] & (1ULL << PROP_EXTENDER)) != 0); +} + +/* See PropList-3.0.1.txt. */ +static bool +is_property_ignorable_control (unsigned int ch) +{ + return ((is_category_Cc (ch) && get_bidi_category (ch) == UC_BIDI_BN) + || is_category_Cf (ch)) + && ch != 0x0000; +} + +/* ------------------------------------------------------------------------- */ + +/* Output all properties. */ +static void +output_properties (const char *version) +{ +#define PROPERTY(P) \ + debug_output_predicate ("unictype/pr_" #P ".txt", is_property_ ## P); \ + output_predicate_test ("../tests/unictype/test-pr_" #P ".c", is_property_ ## P, "uc_is_property_" #P " (c)"); \ + output_predicate ("unictype/pr_" #P ".h", is_property_ ## P, "u_property_" #P, "Properties", version); + PROPERTY(white_space) + PROPERTY(alphabetic) + PROPERTY(other_alphabetic) + PROPERTY(not_a_character) + PROPERTY(default_ignorable_code_point) + PROPERTY(other_default_ignorable_code_point) + PROPERTY(deprecated) + PROPERTY(logical_order_exception) + PROPERTY(variation_selector) + PROPERTY(private_use) + PROPERTY(unassigned_code_value) + PROPERTY(uppercase) + PROPERTY(other_uppercase) + PROPERTY(lowercase) + PROPERTY(other_lowercase) + PROPERTY(titlecase) + PROPERTY(cased) + PROPERTY(case_ignorable) + PROPERTY(changes_when_lowercased) + PROPERTY(changes_when_uppercased) + PROPERTY(changes_when_titlecased) + PROPERTY(changes_when_casefolded) + PROPERTY(changes_when_casemapped) + PROPERTY(soft_dotted) + PROPERTY(id_start) + PROPERTY(other_id_start) + PROPERTY(id_continue) + PROPERTY(other_id_continue) + PROPERTY(xid_start) + PROPERTY(xid_continue) + PROPERTY(pattern_white_space) + PROPERTY(pattern_syntax) + PROPERTY(join_control) + PROPERTY(grapheme_base) + PROPERTY(grapheme_extend) + PROPERTY(other_grapheme_extend) + PROPERTY(grapheme_link) + PROPERTY(bidi_control) + PROPERTY(bidi_left_to_right) + PROPERTY(bidi_hebrew_right_to_left) + PROPERTY(bidi_arabic_right_to_left) + PROPERTY(bidi_european_digit) + PROPERTY(bidi_eur_num_separator) + PROPERTY(bidi_eur_num_terminator) + PROPERTY(bidi_arabic_digit) + PROPERTY(bidi_common_separator) + PROPERTY(bidi_block_separator) + PROPERTY(bidi_segment_separator) + PROPERTY(bidi_whitespace) + PROPERTY(bidi_non_spacing_mark) + PROPERTY(bidi_boundary_neutral) + PROPERTY(bidi_pdf) + PROPERTY(bidi_embedding_or_override) + PROPERTY(bidi_other_neutral) + PROPERTY(hex_digit) + PROPERTY(ascii_hex_digit) + PROPERTY(ideographic) + PROPERTY(unified_ideograph) + PROPERTY(radical) + PROPERTY(ids_binary_operator) + PROPERTY(ids_trinary_operator) + PROPERTY(zero_width) + PROPERTY(space) + PROPERTY(non_break) + PROPERTY(iso_control) + PROPERTY(format_control) + PROPERTY(dash) + PROPERTY(hyphen) + PROPERTY(punctuation) + PROPERTY(line_separator) + PROPERTY(paragraph_separator) + PROPERTY(quotation_mark) + PROPERTY(sentence_terminal) + PROPERTY(terminal_punctuation) + PROPERTY(currency_symbol) + PROPERTY(math) + PROPERTY(other_math) + PROPERTY(paired_punctuation) + PROPERTY(left_of_pair) + PROPERTY(combining) + PROPERTY(composite) + PROPERTY(decimal_digit) + PROPERTY(numeric) + PROPERTY(diacritic) + PROPERTY(extender) + PROPERTY(ignorable_control) +#undef PROPERTY +} + +/* ========================================================================= */ + +/* Arabic Shaping. */ + +enum +{ + UC_JOINING_TYPE_U, /* Non_Joining */ + UC_JOINING_TYPE_T, /* Transparent */ + UC_JOINING_TYPE_C, /* Join_Causing */ + UC_JOINING_TYPE_L, /* Left_Joining */ + UC_JOINING_TYPE_R, /* Right_Joining */ + UC_JOINING_TYPE_D /* Dual_Joining */ +}; + +static uint8_t unicode_joining_type[0x110000]; + +enum +{ + UC_JOINING_GROUP_NONE, /* No_Joining_Group */ + UC_JOINING_GROUP_AIN, /* Ain */ + UC_JOINING_GROUP_ALAPH, /* Alaph */ + UC_JOINING_GROUP_ALEF, /* Alef */ + UC_JOINING_GROUP_BEH, /* Beh */ + UC_JOINING_GROUP_BETH, /* Beth */ + UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, /* Burushaski_Yeh_Barree */ + UC_JOINING_GROUP_DAL, /* Dal */ + UC_JOINING_GROUP_DALATH_RISH, /* Dalath_Rish */ + UC_JOINING_GROUP_E, /* E */ + UC_JOINING_GROUP_FARSI_YEH, /* Farsi_Yeh */ + UC_JOINING_GROUP_FE, /* Fe */ + UC_JOINING_GROUP_FEH, /* Feh */ + UC_JOINING_GROUP_FINAL_SEMKATH, /* Final_Semkath */ + UC_JOINING_GROUP_GAF, /* Gaf */ + UC_JOINING_GROUP_GAMAL, /* Gamal */ + UC_JOINING_GROUP_HAH, /* Hah */ + UC_JOINING_GROUP_HE, /* He */ + UC_JOINING_GROUP_HEH, /* Heh */ + UC_JOINING_GROUP_HEH_GOAL, /* Heh_Goal */ + UC_JOINING_GROUP_HETH, /* Heth */ + UC_JOINING_GROUP_KAF, /* Kaf */ + UC_JOINING_GROUP_KAPH, /* Kaph */ + UC_JOINING_GROUP_KHAPH, /* Khaph */ + UC_JOINING_GROUP_KNOTTED_HEH, /* Knotted_Heh */ + UC_JOINING_GROUP_LAM, /* Lam */ + UC_JOINING_GROUP_LAMADH, /* Lamadh */ + UC_JOINING_GROUP_MEEM, /* Meem */ + UC_JOINING_GROUP_MIM, /* Mim */ + UC_JOINING_GROUP_NOON, /* Noon */ + UC_JOINING_GROUP_NUN, /* Nun */ + UC_JOINING_GROUP_NYA, /* Nya */ + UC_JOINING_GROUP_PE, /* Pe */ + UC_JOINING_GROUP_QAF, /* Qaf */ + UC_JOINING_GROUP_QAPH, /* Qaph */ + UC_JOINING_GROUP_REH, /* Reh */ + UC_JOINING_GROUP_REVERSED_PE, /* Reversed_Pe */ + UC_JOINING_GROUP_SAD, /* Sad */ + UC_JOINING_GROUP_SADHE, /* Sadhe */ + UC_JOINING_GROUP_SEEN, /* Seen */ + UC_JOINING_GROUP_SEMKATH, /* Semkath */ + UC_JOINING_GROUP_SHIN, /* Shin */ + UC_JOINING_GROUP_SWASH_KAF, /* Swash_Kaf */ + UC_JOINING_GROUP_SYRIAC_WAW, /* Syriac_Waw */ + UC_JOINING_GROUP_TAH, /* Tah */ + UC_JOINING_GROUP_TAW, /* Taw */ + UC_JOINING_GROUP_TEH_MARBUTA, /* Teh_Marbuta */ + UC_JOINING_GROUP_TEH_MARBUTA_GOAL, /* Teh_Marbuta_Goal */ + UC_JOINING_GROUP_TETH, /* Teth */ + UC_JOINING_GROUP_WAW, /* Waw */ + UC_JOINING_GROUP_YEH, /* Yeh */ + UC_JOINING_GROUP_YEH_BARREE, /* Yeh_Barree */ + UC_JOINING_GROUP_YEH_WITH_TAIL, /* Yeh_With_Tail */ + UC_JOINING_GROUP_YUDH, /* Yudh */ + UC_JOINING_GROUP_YUDH_HE, /* Yudh_He */ + UC_JOINING_GROUP_ZAIN, /* Zain */ + UC_JOINING_GROUP_ZHAIN /* Zhain */ +}; + +static uint8_t unicode_joining_group[0x110000]; + +static void +fill_arabicshaping (const char *arabicshaping_filename) +{ + FILE *stream; + unsigned int i; + int lineno; + + stream = fopen (arabicshaping_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", arabicshaping_filename); + exit (1); + } + + for (i = 0; i < 0x110000; i++) + { + unicode_joining_type[i] = (uint8_t)~(uint8_t)0; + unicode_joining_group[i] = UC_JOINING_GROUP_NONE; + } + + lineno = 0; + for (;;) + { + char buf[100+1]; + char separator1[100+1]; + char padding1[100+1]; + char schematic_name[100+1]; + char separator2[100+1]; + char padding2[100+1]; + char joining_type_name[100+1]; + char separator3[100+1]; + char padding3[100+1]; + char joining_group_name[100+1]; + int joining_type; + int joining_group; + + lineno++; + if (fscanf (stream, "%100[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X%[;]%[ ]%[^;]%[;]%[ ]%[^;]%[;]%[ ]%100[^\n]", + &i, separator1, padding1, schematic_name, separator2, + padding2, joining_type_name, separator3, padding3, + joining_group_name) != 10) + { + fprintf (stderr, "parse error in '%s':%d\n", + arabicshaping_filename, lineno); + exit (1); + } + if (i >= 0x110000) + abort (); + +#define TRY(name) else if (strcmp (joining_type_name, #name + 16) == 0) joining_type = name; + if (false) {} + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + else + { + fprintf (stderr, "unknown joining type value \"%s\" in '%s':%d\n", + joining_type_name, arabicshaping_filename, lineno); + exit (1); + } + + /* Remove trailing spaces. */ + while (joining_group_name[0] != '\0' + && joining_group_name[strlen (joining_group_name) - 1] == ' ') + joining_group_name[strlen (joining_group_name) - 1] = '\0'; + +#define TRY(value,name) else if (strcmp (joining_group_name, name) == 0) joining_group = value; + if (false) {} + TRY(UC_JOINING_GROUP_NONE, "No_Joining_Group") + TRY(UC_JOINING_GROUP_AIN, "AIN") + TRY(UC_JOINING_GROUP_ALAPH, "ALAPH") + TRY(UC_JOINING_GROUP_ALEF, "ALEF") + TRY(UC_JOINING_GROUP_BEH, "BEH") + TRY(UC_JOINING_GROUP_BETH, "BETH") + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE, "BURUSHASKI YEH BARREE") + TRY(UC_JOINING_GROUP_DAL, "DAL") + TRY(UC_JOINING_GROUP_DALATH_RISH, "DALATH RISH") + TRY(UC_JOINING_GROUP_E, "E") + TRY(UC_JOINING_GROUP_FARSI_YEH, "FARSI YEH") + TRY(UC_JOINING_GROUP_FE, "FE") + TRY(UC_JOINING_GROUP_FEH, "FEH") + TRY(UC_JOINING_GROUP_FINAL_SEMKATH, "FINAL SEMKATH") + TRY(UC_JOINING_GROUP_GAF, "GAF") + TRY(UC_JOINING_GROUP_GAMAL, "GAMAL") + TRY(UC_JOINING_GROUP_HAH, "HAH") + TRY(UC_JOINING_GROUP_HE, "HE") + TRY(UC_JOINING_GROUP_HEH, "HEH") + TRY(UC_JOINING_GROUP_HEH_GOAL, "HEH GOAL") + TRY(UC_JOINING_GROUP_HETH, "HETH") + TRY(UC_JOINING_GROUP_KAF, "KAF") + TRY(UC_JOINING_GROUP_KAPH, "KAPH") + TRY(UC_JOINING_GROUP_KHAPH, "KHAPH") + TRY(UC_JOINING_GROUP_KNOTTED_HEH, "KNOTTED HEH") + TRY(UC_JOINING_GROUP_LAM, "LAM") + TRY(UC_JOINING_GROUP_LAMADH, "LAMADH") + TRY(UC_JOINING_GROUP_MEEM, "MEEM") + TRY(UC_JOINING_GROUP_MIM, "MIM") + TRY(UC_JOINING_GROUP_NOON, "NOON") + TRY(UC_JOINING_GROUP_NUN, "NUN") + TRY(UC_JOINING_GROUP_NYA, "NYA") + TRY(UC_JOINING_GROUP_PE, "PE") + TRY(UC_JOINING_GROUP_QAF, "QAF") + TRY(UC_JOINING_GROUP_QAPH, "QAPH") + TRY(UC_JOINING_GROUP_REH, "REH") + TRY(UC_JOINING_GROUP_REVERSED_PE, "REVERSED PE") + TRY(UC_JOINING_GROUP_SAD, "SAD") + TRY(UC_JOINING_GROUP_SADHE, "SADHE") + TRY(UC_JOINING_GROUP_SEEN, "SEEN") + TRY(UC_JOINING_GROUP_SEMKATH, "SEMKATH") + TRY(UC_JOINING_GROUP_SHIN, "SHIN") + TRY(UC_JOINING_GROUP_SWASH_KAF, "SWASH KAF") + TRY(UC_JOINING_GROUP_SYRIAC_WAW, "SYRIAC WAW") + TRY(UC_JOINING_GROUP_TAH, "TAH") + TRY(UC_JOINING_GROUP_TAW, "TAW") + TRY(UC_JOINING_GROUP_TEH_MARBUTA, "TEH MARBUTA") + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL, "TEH MARBUTA GOAL") + TRY(UC_JOINING_GROUP_TETH, "TETH") + TRY(UC_JOINING_GROUP_WAW, "WAW") + TRY(UC_JOINING_GROUP_YEH, "YEH") + TRY(UC_JOINING_GROUP_YEH_BARREE, "YEH BARREE") + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL, "YEH WITH TAIL") + TRY(UC_JOINING_GROUP_YUDH, "YUDH") + TRY(UC_JOINING_GROUP_YUDH_HE, "YUDH HE") + TRY(UC_JOINING_GROUP_ZAIN, "ZAIN") + TRY(UC_JOINING_GROUP_ZHAIN, "ZHAIN") +#undef TRY + else + { + fprintf (stderr, "unknown joining group value \"%s\" in '%s':%d\n", + joining_group_name, arabicshaping_filename, lineno); + exit (1); + } + + unicode_joining_type[i] = joining_type; + unicode_joining_group[i] = joining_group; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", arabicshaping_filename); + exit (1); + } +} + +/* Convert a Joining_Type value to a C identifier. */ +static const char * +joining_type_as_c_identifier (int joining_type) +{ +#define TRY(value) if (joining_type == value) return #value; + TRY(UC_JOINING_TYPE_U) + TRY(UC_JOINING_TYPE_T) + TRY(UC_JOINING_TYPE_C) + TRY(UC_JOINING_TYPE_L) + TRY(UC_JOINING_TYPE_R) + TRY(UC_JOINING_TYPE_D) +#undef TRY + abort (); +} + +static void +output_joining_type_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = unicode_joining_type[ch]; + + if (value != (uint8_t)~(uint8_t)0) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_type_as_c_identifier (value)); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE joining_type_table +#define ELEMENT uint8_t +#define DEFAULT (uint8_t)~(uint8_t)0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_joining_type (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct joining_type_table t; + unsigned int level1_offset, level2_offset, level3_offset; + uint8_t *level3_packed; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + joining_type_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + uint8_t value = unicode_joining_type[ch]; + + joining_type_table_add (&t, ch, value); + } + + joining_type_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define joining_type_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu * %d];\n", t.level3_size, + (1 << t.p) * 4 / 8); + fprintf (stream, " }\n"); + fprintf (stream, "u_joining_type =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 4 bits only. */ + level3_packed = + (uint8_t *) calloc ((t.level3_size << t.p) * 4 / 8, sizeof (uint8_t)); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned int j = (i * 4) / 8; + unsigned int k = (i * 4) % 8; + uint32_t value = ((unsigned char *) (t.result + level3_offset))[i] & 0x0f; + level3_packed[j] |= (value << k); + } + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 4 / 8 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 4 / 8; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x", level3_packed[i]); + if (i+1 < (t.level3_size << t.p) * 4 / 8) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 4 / 8 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + free (level3_packed); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Convert a Joining_Group value to a C identifier. */ +static const char * +joining_group_as_c_identifier (int joining_group) +{ +#define TRY(value) if (joining_group == value) return #value; + TRY(UC_JOINING_GROUP_NONE) + TRY(UC_JOINING_GROUP_AIN) + TRY(UC_JOINING_GROUP_ALAPH) + TRY(UC_JOINING_GROUP_ALEF) + TRY(UC_JOINING_GROUP_BEH) + TRY(UC_JOINING_GROUP_BETH) + TRY(UC_JOINING_GROUP_BURUSHASKI_YEH_BARREE) + TRY(UC_JOINING_GROUP_DAL) + TRY(UC_JOINING_GROUP_DALATH_RISH) + TRY(UC_JOINING_GROUP_E) + TRY(UC_JOINING_GROUP_FARSI_YEH) + TRY(UC_JOINING_GROUP_FE) + TRY(UC_JOINING_GROUP_FEH) + TRY(UC_JOINING_GROUP_FINAL_SEMKATH) + TRY(UC_JOINING_GROUP_GAF) + TRY(UC_JOINING_GROUP_GAMAL) + TRY(UC_JOINING_GROUP_HAH) + TRY(UC_JOINING_GROUP_HE) + TRY(UC_JOINING_GROUP_HEH) + TRY(UC_JOINING_GROUP_HEH_GOAL) + TRY(UC_JOINING_GROUP_HETH) + TRY(UC_JOINING_GROUP_KAF) + TRY(UC_JOINING_GROUP_KAPH) + TRY(UC_JOINING_GROUP_KHAPH) + TRY(UC_JOINING_GROUP_KNOTTED_HEH) + TRY(UC_JOINING_GROUP_LAM) + TRY(UC_JOINING_GROUP_LAMADH) + TRY(UC_JOINING_GROUP_MEEM) + TRY(UC_JOINING_GROUP_MIM) + TRY(UC_JOINING_GROUP_NOON) + TRY(UC_JOINING_GROUP_NUN) + TRY(UC_JOINING_GROUP_NYA) + TRY(UC_JOINING_GROUP_PE) + TRY(UC_JOINING_GROUP_QAF) + TRY(UC_JOINING_GROUP_QAPH) + TRY(UC_JOINING_GROUP_REH) + TRY(UC_JOINING_GROUP_REVERSED_PE) + TRY(UC_JOINING_GROUP_SAD) + TRY(UC_JOINING_GROUP_SADHE) + TRY(UC_JOINING_GROUP_SEEN) + TRY(UC_JOINING_GROUP_SEMKATH) + TRY(UC_JOINING_GROUP_SHIN) + TRY(UC_JOINING_GROUP_SWASH_KAF) + TRY(UC_JOINING_GROUP_SYRIAC_WAW) + TRY(UC_JOINING_GROUP_TAH) + TRY(UC_JOINING_GROUP_TAW) + TRY(UC_JOINING_GROUP_TEH_MARBUTA) + TRY(UC_JOINING_GROUP_TEH_MARBUTA_GOAL) + TRY(UC_JOINING_GROUP_TETH) + TRY(UC_JOINING_GROUP_WAW) + TRY(UC_JOINING_GROUP_YEH) + TRY(UC_JOINING_GROUP_YEH_BARREE) + TRY(UC_JOINING_GROUP_YEH_WITH_TAIL) + TRY(UC_JOINING_GROUP_YUDH) + TRY(UC_JOINING_GROUP_YUDH_HE) + TRY(UC_JOINING_GROUP_ZAIN) + TRY(UC_JOINING_GROUP_ZHAIN) +#undef TRY + abort (); +} + +static void +output_joining_group_test (const char *filename, const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining group of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int value = unicode_joining_group[ch]; + + if (value != UC_JOINING_GROUP_NONE) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, %s }", ch, joining_group_as_c_identifier (value)); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_joining_group (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch_min, ch_max, ch, i; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Arabic joining type of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + ch_min = 0x10FFFF; + for (ch = 0; ch < 0x110000; ch++) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_min = ch; + break; + } + + ch_max = 0; + for (ch = 0x10FFFF; ch > 0; ch--) + if (unicode_joining_group[ch] != UC_JOINING_GROUP_NONE) + { + ch_max = ch; + break; + } + + if (!(ch_min <= ch_max)) + abort (); + + /* If the interval [ch_min, ch_max] is too large, we should better use a + 3-level table. */ + if (!(ch_max - ch_min < 0x200)) + abort (); + + fprintf (stream, "#define joining_group_header_0 0x%x\n", ch_min); + fprintf (stream, "static const unsigned char u_joining_group[0x%x - 0x%x] =\n", + ch_max + 1, ch_min); + fprintf (stream, "{"); + for (i = 0; i <= ch_max - ch_min; i++) + { + const char *s; + + ch = ch_min + i; + if ((i % 2) == 0) + fprintf (stream, "\n "); + s = joining_group_as_c_identifier (unicode_joining_group[ch]); + fprintf (stream, " %s", s); + if (i+1 <= ch_max - ch_min) + { + fprintf (stream, ","); + if (((i+1) % 2) != 0) + fprintf (stream, "%*s", 38 - (int) strlen (s), ""); + } + } + fprintf (stream, "\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Scripts. */ + +static const char *scripts[256]; +static unsigned int numscripts; + +static uint8_t unicode_scripts[0x110000]; + +static void +fill_scripts (const char *scripts_filename) +{ + FILE *stream; + unsigned int i; + + stream = fopen (scripts_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", scripts_filename); + exit (1); + } + + numscripts = 0; + + for (i = 0; i < 0x110000; i++) + unicode_scripts[i] = (uint8_t)~(uint8_t)0; + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char scriptname[200+1]; + int script; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, scriptname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, scriptname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", scripts_filename); + exit (1); + } + i2 = i1; + } + if (i2 < i1) + abort (); + if (i2 >= 0x110000) + abort (); + + for (script = numscripts - 1; script >= 0; script--) + if (strcmp (scripts[script], scriptname) == 0) + break; + if (script < 0) + { + scripts[numscripts] = strdup (scriptname); + script = numscripts; + numscripts++; + if (numscripts == 256) + abort (); + } + + for (i = i1; i <= i2; i++) + { + if (unicode_scripts[i] != (uint8_t)~(uint8_t)0) + fprintf (stderr, "0x%04X belongs to multiple scripts\n", i); + unicode_scripts[i] = script; + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", scripts_filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE script_table +#define ELEMENT uint8_t +#define DEFAULT (uint8_t)~(uint8_t)0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_scripts (const char *version) +{ + const char *filename = "unictype/scripts.h"; + FILE *stream; + unsigned int ch, s, i; + struct script_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + typedef struct + { + const char *lowercase_name; + } + scriptinfo_t; + scriptinfo_t scriptinfo[256]; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Unicode scripts. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + for (s = 0; s < numscripts; s++) + { + char *lcp = strdup (scripts[s]); + char *cp; + + for (cp = lcp; *cp != '\0'; cp++) + if (*cp >= 'A' && *cp <= 'Z') + *cp += 'a' - 'A'; + + scriptinfo[s].lowercase_name = lcp; + } + + for (s = 0; s < numscripts; s++) + { + fprintf (stream, "static const uc_interval_t script_%s_intervals[] =\n", + scriptinfo[s].lowercase_name); + fprintf (stream, "{\n"); + i = 0; + for (ch = 0; ch < 0x110000; ch++) + if (unicode_scripts[ch] == s) + { + unsigned int start; + unsigned int end; + + start = ch; + while (ch + 1 < 0x110000 && unicode_scripts[ch + 1] == s) + ch++; + end = ch; + + if (i > 0) + fprintf (stream, ",\n"); + if (start == end) + fprintf (stream, " { 0x%04X, 1, 1 }", start); + else + fprintf (stream, " { 0x%04X, 1, 0 }, { 0x%04X, 0, 1 }", + start, end); + i++; + } + fprintf (stream, "\n"); + fprintf (stream, "};\n"); + } + + fprintf (stream, "static const uc_script_t scripts[%d] =\n", numscripts); + fprintf (stream, "{\n"); + for (s = 0; s < numscripts; s++) + { + fprintf (stream, " {\n"); + fprintf (stream, " sizeof (script_%s_intervals) / sizeof (uc_interval_t),\n", + scriptinfo[s].lowercase_name); + fprintf (stream, " script_%s_intervals,\n", + scriptinfo[s].lowercase_name); + fprintf (stream, " \"%s\"\n", scripts[s]); + fprintf (stream, " }"); + if (s+1 < numscripts) + fprintf (stream, ","); + fprintf (stream, "\n"); + } + fprintf (stream, "};\n"); + + t.p = 7; + t.q = 9; + script_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int s = unicode_scripts[ch]; + if (s != (uint8_t)~(uint8_t)0) + script_table_add (&t, ch, s); + } + + script_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define script_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_script =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %3d", ((uint8_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_scripts_byname (const char *version) +{ + const char *filename = "unictype/scripts_byname.gperf"; + FILE *stream; + unsigned int s; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Unicode scripts. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "struct named_script { int name; unsigned int index; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define hash-function-name scripts_hash\n"); + fprintf (stream, "%%define lookup-function-name uc_script_lookup\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%global-table\n"); + fprintf (stream, "%%define word-array-name script_names\n"); + fprintf (stream, "%%pic\n"); + fprintf (stream, "%%define string-pool-name script_stringpool\n"); + fprintf (stream, "%%%%\n"); + for (s = 0; s < numscripts; s++) + fprintf (stream, "%s, %u\n", scripts[s], s); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Blocks. */ + +typedef struct { unsigned int start; unsigned int end; const char *name; } + block_t; +static block_t blocks[256]; +static unsigned int numblocks; + +static void +fill_blocks (const char *blocks_filename) +{ + FILE *stream; + + stream = fopen (blocks_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", blocks_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char blockname[200+1]; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^\r]", &i1, &i2, padding, blockname) != 4) + { + fprintf (stderr, "parse error in '%s'\n", blocks_filename); + exit (1); + } + blocks[numblocks].start = i1; + blocks[numblocks].end = i2; + blocks[numblocks].name = strdup (blockname); + /* It must be sorted. */ + if (numblocks > 0 && !(blocks[numblocks-1].end < blocks[numblocks].start)) + abort (); + numblocks++; + if (numblocks == 256) + abort (); + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", blocks_filename); + exit (1); + } +} + +/* Return the smallest block index among the blocks for characters >= ch. */ +static unsigned int +block_first_index (unsigned int ch) +{ + /* Binary search. */ + unsigned int lo = 0; + unsigned int hi = numblocks; + /* Invariants: + All blocks[i], i < lo, have blocks[i].end < ch, + all blocks[i], i >= hi, have blocks[i].end >= ch. */ + while (lo < hi) + { + unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */ + if (blocks[mid].end < ch) + lo = mid + 1; + else + hi = mid; + } + return hi; +} + +/* Return the largest block index among the blocks for characters <= ch, + plus 1. */ +static unsigned int +block_last_index (unsigned int ch) +{ + /* Binary search. */ + unsigned int lo = 0; + unsigned int hi = numblocks; + /* Invariants: + All blocks[i], i < lo, have blocks[i].start <= ch, + all blocks[i], i >= hi, have blocks[i].start > ch. */ + while (lo < hi) + { + unsigned int mid = (lo + hi) / 2; /* >= lo, < hi */ + if (blocks[mid].start <= ch) + lo = mid + 1; + else + hi = mid; + } + return hi; +} + +static void +output_blocks (const char *version) +{ + const char *filename = "unictype/blocks.h"; + const unsigned int shift = 8; /* bits to shift away for array access */ + const unsigned int threshold = 0x30000; /* cut-off table here to save space */ + FILE *stream; + unsigned int i; + unsigned int i1; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Unicode blocks. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + fprintf (stream, "static const uc_block_t blocks[] =\n"); + fprintf (stream, "{\n"); + for (i = 0; i < numblocks; i++) + { + fprintf (stream, " { 0x%04X, 0x%04X, \"%s\" }", blocks[i].start, + blocks[i].end, blocks[i].name); + if (i+1 < numblocks) + fprintf (stream, ","); + fprintf (stream, "\n"); + } + fprintf (stream, "};\n"); + fprintf (stream, "#define blocks_level1_shift %d\n", shift); + fprintf (stream, "#define blocks_level1_threshold 0x%04X\n", threshold); + fprintf (stream, "static const uint8_t blocks_level1[%d * 2] =\n", + threshold >> shift); + fprintf (stream, "{\n"); + for (i1 = 0; i1 < (threshold >> shift); i1++) + { + unsigned int first_index = block_first_index (i1 << shift); + unsigned int last_index = block_last_index (((i1 + 1) << shift) - 1); + fprintf (stream, " %3d, %3d", first_index, last_index); + if (i1+1 < (threshold >> shift)) + fprintf (stream, ","); + fprintf (stream, "\n"); + } + fprintf (stream, "};\n"); + fprintf (stream, "#define blocks_upper_first_index %d\n", + block_first_index (threshold)); + fprintf (stream, "#define blocks_upper_last_index %d\n", + block_last_index (0x10FFFF)); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* C and Java syntax. */ + +enum +{ + UC_IDENTIFIER_START, /* valid as first or subsequent character */ + UC_IDENTIFIER_VALID, /* valid as subsequent character only */ + UC_IDENTIFIER_INVALID, /* not valid */ + UC_IDENTIFIER_IGNORABLE /* ignorable (Java only) */ +}; + +/* ISO C 99 section 6.4.(3). */ +static bool +is_c_whitespace (unsigned int ch) +{ + return (ch == ' ' /* space */ + || ch == '\t' /* horizontal tab */ + || ch == '\n' || ch == '\r' /* new-line */ + || ch == '\v' /* vertical tab */ + || ch == '\f'); /* form-feed */ +} + +/* ISO C 99 section 6.4.2.1 and appendix D. */ +static int +c_ident_category (unsigned int ch) +{ + /* Section 6.4.2.1. */ + if (ch >= '0' && ch <= '9') + return UC_IDENTIFIER_VALID; + if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_') + return UC_IDENTIFIER_START; + /* Appendix D. */ + if (0 + /* Latin */ + || (ch == 0x00AA) + || (ch == 0x00BA) + || (ch >= 0x00C0 && ch <= 0x00D6) + || (ch >= 0x00D8 && ch <= 0x00F6) + || (ch >= 0x00F8 && ch <= 0x01F5) + || (ch >= 0x01FA && ch <= 0x0217) + || (ch >= 0x0250 && ch <= 0x02A8) + || (ch >= 0x1E00 && ch <= 0x1E9B) + || (ch >= 0x1EA0 && ch <= 0x1EF9) + || (ch == 0x207F) + /* Greek */ + || (ch == 0x0386) + || (ch >= 0x0388 && ch <= 0x038A) + || (ch == 0x038C) + || (ch >= 0x038E && ch <= 0x03A1) + || (ch >= 0x03A3 && ch <= 0x03CE) + || (ch >= 0x03D0 && ch <= 0x03D6) + || (ch == 0x03DA) + || (ch == 0x03DC) + || (ch == 0x03DE) + || (ch == 0x03E0) + || (ch >= 0x03E2 && ch <= 0x03F3) + || (ch >= 0x1F00 && ch <= 0x1F15) + || (ch >= 0x1F18 && ch <= 0x1F1D) + || (ch >= 0x1F20 && ch <= 0x1F45) + || (ch >= 0x1F48 && ch <= 0x1F4D) + || (ch >= 0x1F50 && ch <= 0x1F57) + || (ch == 0x1F59) + || (ch == 0x1F5B) + || (ch == 0x1F5D) + || (ch >= 0x1F5F && ch <= 0x1F7D) + || (ch >= 0x1F80 && ch <= 0x1FB4) + || (ch >= 0x1FB6 && ch <= 0x1FBC) + || (ch >= 0x1FC2 && ch <= 0x1FC4) + || (ch >= 0x1FC6 && ch <= 0x1FCC) + || (ch >= 0x1FD0 && ch <= 0x1FD3) + || (ch >= 0x1FD6 && ch <= 0x1FDB) + || (ch >= 0x1FE0 && ch <= 0x1FEC) + || (ch >= 0x1FF2 && ch <= 0x1FF4) + || (ch >= 0x1FF6 && ch <= 0x1FFC) + /* Cyrillic */ + || (ch >= 0x0401 && ch <= 0x040C) + || (ch >= 0x040E && ch <= 0x044F) + || (ch >= 0x0451 && ch <= 0x045C) + || (ch >= 0x045E && ch <= 0x0481) + || (ch >= 0x0490 && ch <= 0x04C4) + || (ch >= 0x04C7 && ch <= 0x04C8) + || (ch >= 0x04CB && ch <= 0x04CC) + || (ch >= 0x04D0 && ch <= 0x04EB) + || (ch >= 0x04EE && ch <= 0x04F5) + || (ch >= 0x04F8 && ch <= 0x04F9) + /* Armenian */ + || (ch >= 0x0531 && ch <= 0x0556) + || (ch >= 0x0561 && ch <= 0x0587) + /* Hebrew */ + || (ch >= 0x05B0 && ch <= 0x05B9) + || (ch >= 0x05BB && ch <= 0x05BD) + || (ch == 0x05BF) + || (ch >= 0x05C1 && ch <= 0x05C2) + || (ch >= 0x05D0 && ch <= 0x05EA) + || (ch >= 0x05F0 && ch <= 0x05F2) + /* Arabic */ + || (ch >= 0x0621 && ch <= 0x063A) + || (ch >= 0x0640 && ch <= 0x0652) + || (ch >= 0x0670 && ch <= 0x06B7) + || (ch >= 0x06BA && ch <= 0x06BE) + || (ch >= 0x06C0 && ch <= 0x06CE) + || (ch >= 0x06D0 && ch <= 0x06DC) + || (ch >= 0x06E5 && ch <= 0x06E8) + || (ch >= 0x06EA && ch <= 0x06ED) + /* Devanagari */ + || (ch >= 0x0901 && ch <= 0x0903) + || (ch >= 0x0905 && ch <= 0x0939) + || (ch >= 0x093E && ch <= 0x094D) + || (ch >= 0x0950 && ch <= 0x0952) + || (ch >= 0x0958 && ch <= 0x0963) + /* Bengali */ + || (ch >= 0x0981 && ch <= 0x0983) + || (ch >= 0x0985 && ch <= 0x098C) + || (ch >= 0x098F && ch <= 0x0990) + || (ch >= 0x0993 && ch <= 0x09A8) + || (ch >= 0x09AA && ch <= 0x09B0) + || (ch == 0x09B2) + || (ch >= 0x09B6 && ch <= 0x09B9) + || (ch >= 0x09BE && ch <= 0x09C4) + || (ch >= 0x09C7 && ch <= 0x09C8) + || (ch >= 0x09CB && ch <= 0x09CD) + || (ch >= 0x09DC && ch <= 0x09DD) + || (ch >= 0x09DF && ch <= 0x09E3) + || (ch >= 0x09F0 && ch <= 0x09F1) + /* Gurmukhi */ + || (ch == 0x0A02) + || (ch >= 0x0A05 && ch <= 0x0A0A) + || (ch >= 0x0A0F && ch <= 0x0A10) + || (ch >= 0x0A13 && ch <= 0x0A28) + || (ch >= 0x0A2A && ch <= 0x0A30) + || (ch >= 0x0A32 && ch <= 0x0A33) + || (ch >= 0x0A35 && ch <= 0x0A36) + || (ch >= 0x0A38 && ch <= 0x0A39) + || (ch >= 0x0A3E && ch <= 0x0A42) + || (ch >= 0x0A47 && ch <= 0x0A48) + || (ch >= 0x0A4B && ch <= 0x0A4D) + || (ch >= 0x0A59 && ch <= 0x0A5C) + || (ch == 0x0A5E) + || (ch == 0x0A74) + /* Gujarati */ + || (ch >= 0x0A81 && ch <= 0x0A83) + || (ch >= 0x0A85 && ch <= 0x0A8B) + || (ch == 0x0A8D) + || (ch >= 0x0A8F && ch <= 0x0A91) + || (ch >= 0x0A93 && ch <= 0x0AA8) + || (ch >= 0x0AAA && ch <= 0x0AB0) + || (ch >= 0x0AB2 && ch <= 0x0AB3) + || (ch >= 0x0AB5 && ch <= 0x0AB9) + || (ch >= 0x0ABD && ch <= 0x0AC5) + || (ch >= 0x0AC7 && ch <= 0x0AC9) + || (ch >= 0x0ACB && ch <= 0x0ACD) + || (ch == 0x0AD0) + || (ch == 0x0AE0) + /* Oriya */ + || (ch >= 0x0B01 && ch <= 0x0B03) + || (ch >= 0x0B05 && ch <= 0x0B0C) + || (ch >= 0x0B0F && ch <= 0x0B10) + || (ch >= 0x0B13 && ch <= 0x0B28) + || (ch >= 0x0B2A && ch <= 0x0B30) + || (ch >= 0x0B32 && ch <= 0x0B33) + || (ch >= 0x0B36 && ch <= 0x0B39) + || (ch >= 0x0B3E && ch <= 0x0B43) + || (ch >= 0x0B47 && ch <= 0x0B48) + || (ch >= 0x0B4B && ch <= 0x0B4D) + || (ch >= 0x0B5C && ch <= 0x0B5D) + || (ch >= 0x0B5F && ch <= 0x0B61) + /* Tamil */ + || (ch >= 0x0B82 && ch <= 0x0B83) + || (ch >= 0x0B85 && ch <= 0x0B8A) + || (ch >= 0x0B8E && ch <= 0x0B90) + || (ch >= 0x0B92 && ch <= 0x0B95) + || (ch >= 0x0B99 && ch <= 0x0B9A) + || (ch == 0x0B9C) + || (ch >= 0x0B9E && ch <= 0x0B9F) + || (ch >= 0x0BA3 && ch <= 0x0BA4) + || (ch >= 0x0BA8 && ch <= 0x0BAA) + || (ch >= 0x0BAE && ch <= 0x0BB5) + || (ch >= 0x0BB7 && ch <= 0x0BB9) + || (ch >= 0x0BBE && ch <= 0x0BC2) + || (ch >= 0x0BC6 && ch <= 0x0BC8) + || (ch >= 0x0BCA && ch <= 0x0BCD) + /* Telugu */ + || (ch >= 0x0C01 && ch <= 0x0C03) + || (ch >= 0x0C05 && ch <= 0x0C0C) + || (ch >= 0x0C0E && ch <= 0x0C10) + || (ch >= 0x0C12 && ch <= 0x0C28) + || (ch >= 0x0C2A && ch <= 0x0C33) + || (ch >= 0x0C35 && ch <= 0x0C39) + || (ch >= 0x0C3E && ch <= 0x0C44) + || (ch >= 0x0C46 && ch <= 0x0C48) + || (ch >= 0x0C4A && ch <= 0x0C4D) + || (ch >= 0x0C60 && ch <= 0x0C61) + /* Kannada */ + || (ch >= 0x0C82 && ch <= 0x0C83) + || (ch >= 0x0C85 && ch <= 0x0C8C) + || (ch >= 0x0C8E && ch <= 0x0C90) + || (ch >= 0x0C92 && ch <= 0x0CA8) + || (ch >= 0x0CAA && ch <= 0x0CB3) + || (ch >= 0x0CB5 && ch <= 0x0CB9) + || (ch >= 0x0CBE && ch <= 0x0CC4) + || (ch >= 0x0CC6 && ch <= 0x0CC8) + || (ch >= 0x0CCA && ch <= 0x0CCD) + || (ch == 0x0CDE) + || (ch >= 0x0CE0 && ch <= 0x0CE1) + /* Malayalam */ + || (ch >= 0x0D02 && ch <= 0x0D03) + || (ch >= 0x0D05 && ch <= 0x0D0C) + || (ch >= 0x0D0E && ch <= 0x0D10) + || (ch >= 0x0D12 && ch <= 0x0D28) + || (ch >= 0x0D2A && ch <= 0x0D39) + || (ch >= 0x0D3E && ch <= 0x0D43) + || (ch >= 0x0D46 && ch <= 0x0D48) + || (ch >= 0x0D4A && ch <= 0x0D4D) + || (ch >= 0x0D60 && ch <= 0x0D61) + /* Thai */ + || (ch >= 0x0E01 && ch <= 0x0E3A) + || (ch >= 0x0E40 && ch <= 0x0E5B) + /* Lao */ + || (ch >= 0x0E81 && ch <= 0x0E82) + || (ch == 0x0E84) + || (ch >= 0x0E87 && ch <= 0x0E88) + || (ch == 0x0E8A) + || (ch == 0x0E8D) + || (ch >= 0x0E94 && ch <= 0x0E97) + || (ch >= 0x0E99 && ch <= 0x0E9F) + || (ch >= 0x0EA1 && ch <= 0x0EA3) + || (ch == 0x0EA5) + || (ch == 0x0EA7) + || (ch >= 0x0EAA && ch <= 0x0EAB) + || (ch >= 0x0EAD && ch <= 0x0EAE) + || (ch >= 0x0EB0 && ch <= 0x0EB9) + || (ch >= 0x0EBB && ch <= 0x0EBD) + || (ch >= 0x0EC0 && ch <= 0x0EC4) + || (ch == 0x0EC6) + || (ch >= 0x0EC8 && ch <= 0x0ECD) + || (ch >= 0x0EDC && ch <= 0x0EDD) + /* Tibetan */ + || (ch == 0x0F00) + || (ch >= 0x0F18 && ch <= 0x0F19) + || (ch == 0x0F35) + || (ch == 0x0F37) + || (ch == 0x0F39) + || (ch >= 0x0F3E && ch <= 0x0F47) + || (ch >= 0x0F49 && ch <= 0x0F69) + || (ch >= 0x0F71 && ch <= 0x0F84) + || (ch >= 0x0F86 && ch <= 0x0F8B) + || (ch >= 0x0F90 && ch <= 0x0F95) + || (ch == 0x0F97) + || (ch >= 0x0F99 && ch <= 0x0FAD) + || (ch >= 0x0FB1 && ch <= 0x0FB7) + || (ch == 0x0FB9) + /* Georgian */ + || (ch >= 0x10A0 && ch <= 0x10C5) + || (ch >= 0x10D0 && ch <= 0x10F6) + /* Hiragana */ + || (ch >= 0x3041 && ch <= 0x3093) + || (ch >= 0x309B && ch <= 0x309C) + /* Katakana */ + || (ch >= 0x30A1 && ch <= 0x30F6) + || (ch >= 0x30FB && ch <= 0x30FC) + /* Bopomofo */ + || (ch >= 0x3105 && ch <= 0x312C) + /* CJK Unified Ideographs */ + || (ch >= 0x4E00 && ch <= 0x9FA5) + /* Hangul */ + || (ch >= 0xAC00 && ch <= 0xD7A3) + /* Digits */ + || (ch >= 0x0660 && ch <= 0x0669) + || (ch >= 0x06F0 && ch <= 0x06F9) + || (ch >= 0x0966 && ch <= 0x096F) + || (ch >= 0x09E6 && ch <= 0x09EF) + || (ch >= 0x0A66 && ch <= 0x0A6F) + || (ch >= 0x0AE6 && ch <= 0x0AEF) + || (ch >= 0x0B66 && ch <= 0x0B6F) + || (ch >= 0x0BE7 && ch <= 0x0BEF) + || (ch >= 0x0C66 && ch <= 0x0C6F) + || (ch >= 0x0CE6 && ch <= 0x0CEF) + || (ch >= 0x0D66 && ch <= 0x0D6F) + || (ch >= 0x0E50 && ch <= 0x0E59) + || (ch >= 0x0ED0 && ch <= 0x0ED9) + || (ch >= 0x0F20 && ch <= 0x0F33) + /* Special characters */ + || (ch == 0x00B5) + || (ch == 0x00B7) + || (ch >= 0x02B0 && ch <= 0x02B8) + || (ch == 0x02BB) + || (ch >= 0x02BD && ch <= 0x02C1) + || (ch >= 0x02D0 && ch <= 0x02D1) + || (ch >= 0x02E0 && ch <= 0x02E4) + || (ch == 0x037A) + || (ch == 0x0559) + || (ch == 0x093D) + || (ch == 0x0B3D) + || (ch == 0x1FBE) + || (ch >= 0x203F && ch <= 0x2040) + || (ch == 0x2102) + || (ch == 0x2107) + || (ch >= 0x210A && ch <= 0x2113) + || (ch == 0x2115) + || (ch >= 0x2118 && ch <= 0x211D) + || (ch == 0x2124) + || (ch == 0x2126) + || (ch == 0x2128) + || (ch >= 0x212A && ch <= 0x2131) + || (ch >= 0x2133 && ch <= 0x2138) + || (ch >= 0x2160 && ch <= 0x2182) + || (ch >= 0x3005 && ch <= 0x3007) + || (ch >= 0x3021 && ch <= 0x3029) + ) + return UC_IDENTIFIER_START; + return UC_IDENTIFIER_INVALID; +} + +/* The Java Language Specification, 3rd edition, §3.6. + http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#95710 */ +static bool +is_java_whitespace (unsigned int ch) +{ + return (ch == ' ' || ch == '\t' || ch == '\f' + || ch == '\n' || ch == '\r'); +} + +/* The Java Language Specification, 3rd edition, §3.8. + http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#40625 + and Character.isJavaIdentifierStart and Character.isJavaIdentifierPart */ +static int +java_ident_category (unsigned int ch) +{ + /* FIXME: Check this against Sun's JDK implementation. */ + if (is_category_L (ch) /* = Character.isLetter(ch) */ + || is_category_Nl (ch) /* = Character.getType(ch)==LETTER_NUMBER */ + || is_category_Sc (ch) /* currency symbol */ + || is_category_Pc (ch) /* connector punctuation */ + ) + return UC_IDENTIFIER_START; + if (is_category_Nd (ch) /* digit */ + || is_category_Mc (ch) /* combining mark */ + || is_category_Mn (ch) /* non-spacing mark */ + ) + return UC_IDENTIFIER_VALID; + if ((ch >= 0x0000 && ch <= 0x0008) + || (ch >= 0x000E && ch <= 0x001B) + || (ch >= 0x007F && ch <= 0x009F) + || is_category_Cf (ch) /* = Character.getType(ch)==FORMAT */ + ) + return UC_IDENTIFIER_IGNORABLE; + return UC_IDENTIFIER_INVALID; +} + +/* Construction of sparse 3-level tables. */ +#define TABLE identsyntax_table +#define ELEMENT uint8_t +#define DEFAULT UC_IDENTIFIER_INVALID +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output an identifier syntax categorization in a three-level bitmap. */ +static void +output_ident_category (const char *filename, int (*predicate) (unsigned int), const char *name, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct identsyntax_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Language syntax properties of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; /* or 8 */ + t.q = 5; /* or 4 */ + identsyntax_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int syntaxcode = predicate (ch); + if (syntaxcode != UC_IDENTIFIER_INVALID) + identsyntax_table_add (&t, ch, syntaxcode); + } + + identsyntax_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define identsyntax_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned short level3[%zu * %d];\n", t.level3_size, + (1 << t.p) * 2 / 16); + fprintf (stream, " }\n"); + fprintf (stream, "%s =\n", name); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + /* Pack the level3 array. Each entry needs 2 bits only. */ + fprintf (stream, " {"); + if ((t.level3_size << t.p) * 2 / 16 > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) * 2 / 16; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%04x", + (((uint8_t *) (t.result + level3_offset))[8 * i] << 0) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 1] << 2) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 2] << 4) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 3] << 6) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 4] << 8) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 5] << 10) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 6] << 12) + | (((uint8_t *) (t.result + level3_offset))[8 * i + 7] << 14)); + if (i+1 < (t.level3_size << t.p) * 2 / 16) + fprintf (stream, ","); + } + if ((t.level3_size << t.p) * 2 / 16 > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_ident_properties (const char *version) +{ +#define PROPERTY(P) \ + debug_output_predicate ("unictype/sy_" #P ".txt", is_ ## P); \ + output_predicate_test ("../tests/unictype/test-sy_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \ + output_predicate ("unictype/sy_" #P ".h", is_ ## P, "u_" #P, "Language syntax properties", version); + PROPERTY(c_whitespace) + PROPERTY(java_whitespace) +#undef PROPERTY + + output_ident_category ("unictype/sy_c_ident.h", c_ident_category, "u_c_ident", version); + output_ident_category ("unictype/sy_java_ident.h", java_ident_category, "u_java_ident", version); +} + +/* ========================================================================= */ + +/* Like ISO C <ctype.h> and <wctype.h>. Compatible to glibc's + glibc/localedata/locales/i18n file, generated by + glibc/localedata/gen-unicode-ctype.c. */ + +/* Character mappings. */ + +static unsigned int +to_upper (unsigned int ch) +{ + if (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].upper != NONE) + return unicode_attributes[ch].upper; + else + return ch; +} + +static unsigned int +to_lower (unsigned int ch) +{ + if (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].lower != NONE) + return unicode_attributes[ch].lower; + else + return ch; +} + +static unsigned int +to_title (unsigned int ch) +{ + if (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].title != NONE) + return unicode_attributes[ch].title; + else + return ch; +} + +/* Character class properties. */ + +static bool +is_upper (unsigned int ch) +{ + return (to_lower (ch) != ch); +} + +static bool +is_lower (unsigned int ch) +{ + return (to_upper (ch) != ch) + /* <U00DF> is lowercase, but without simple to_upper mapping. */ + || (ch == 0x00DF); +} + +static bool +is_alpha (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && ((unicode_attributes[ch].category[0] == 'L' + /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says + <U0E2F>, <U0E46> should belong to is_punct. */ + && (ch != 0x0E2F) && (ch != 0x0E46)) + /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says + <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */ + || (ch == 0x0E31) + || (ch >= 0x0E34 && ch <= 0x0E3A) + || (ch >= 0x0E47 && ch <= 0x0E4E) + /* Avoid warning for <U0345>. */ + || (ch == 0x0345) + /* Avoid warnings for <U2160>..<U217F>. */ + || (unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'l') + /* Avoid warnings for <U24B6>..<U24E9>. */ + || (unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'o' + && strstr (unicode_attributes[ch].name, " LETTER ") + != NULL) + /* Consider all the non-ASCII digits as alphabetic. + ISO C 99 forbids us to have them in category "digit", + but we want iswalnum to return true on them. */ + || (unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'd' + && !(ch >= 0x0030 && ch <= 0x0039)))); +} + +static bool +is_digit (unsigned int ch) +{ +#if 0 + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'd'); + /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without + a zero. Must add <0> in front of them by hand. */ +#else + /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99 + takes it away: + 7.25.2.1.5: + The iswdigit function tests for any wide character that corresponds + to a decimal-digit character (as defined in 5.2.1). + 5.2.1: + the 10 decimal digits 0 1 2 3 4 5 6 7 8 9 + */ + return (ch >= 0x0030 && ch <= 0x0039); +#endif +} + +static bool +is_outdigit (unsigned int ch) +{ + return (ch >= 0x0030 && ch <= 0x0039); +} + +static bool +is_alnum (unsigned int ch) +{ + return is_alpha (ch) || is_digit (ch); +} + +static bool +is_blank (unsigned int ch) +{ + return (ch == 0x0009 /* '\t' */ + /* Category Zs without mention of "<noBreak>" */ + || (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && unicode_attributes[ch].category[1] == 's' + && !strstr (unicode_attributes[ch].decomposition, "<noBreak>"))); +} + +static bool +is_space (unsigned int ch) +{ + /* Don't make U+00A0 a space. Non-breaking space means that all programs + should treat it like a punctuation character, not like a space. */ + return (ch == 0x0020 /* ' ' */ + || ch == 0x000C /* '\f' */ + || ch == 0x000A /* '\n' */ + || ch == 0x000D /* '\r' */ + || ch == 0x0009 /* '\t' */ + || ch == 0x000B /* '\v' */ + /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */ + || (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'p' + || (unicode_attributes[ch].category[1] == 's' + && !strstr (unicode_attributes[ch].decomposition, + "<noBreak>"))))); +} + +static bool +is_cntrl (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && (strcmp (unicode_attributes[ch].name, "<control>") == 0 + /* Categories Zl and Zp */ + || (unicode_attributes[ch].category[0] == 'Z' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'p')))); +} + +static bool +is_xdigit (unsigned int ch) +{ +#if 0 + return is_digit (ch) + || (ch >= 0x0041 && ch <= 0x0046) + || (ch >= 0x0061 && ch <= 0x0066); +#else + /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99 + takes it away: + 7.25.2.1.12: + The iswxdigit function tests for any wide character that corresponds + to a hexadecimal-digit character (as defined in 6.4.4.1). + 6.4.4.1: + hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F + */ + return (ch >= 0x0030 && ch <= 0x0039) + || (ch >= 0x0041 && ch <= 0x0046) + || (ch >= 0x0061 && ch <= 0x0066); +#endif +} + +static bool +is_graph (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && strcmp (unicode_attributes[ch].name, "<control>") + && !is_space (ch)); +} + +static bool +is_print (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && strcmp (unicode_attributes[ch].name, "<control>") + /* Categories Zl and Zp */ + && !(unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'Z' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'p'))); +} + +static bool +is_punct (unsigned int ch) +{ +#if 0 + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'P'); +#else + /* The traditional POSIX definition of punctuation is every graphic, + non-alphanumeric character. */ + return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch)); +#endif +} + +/* Output all properties. */ +static void +output_old_ctype (const char *version) +{ +#define PROPERTY(P) \ + debug_output_predicate ("unictype/ctype_" #P ".txt", is_ ## P); \ + output_predicate_test ("../tests/unictype/test-ctype_" #P ".c", is_ ## P, "uc_is_" #P " (c)"); \ + output_predicate ("unictype/ctype_" #P ".h", is_ ## P, "u_is_" #P, "ISO C <ctype.h> like properties", version); + PROPERTY(alnum) + PROPERTY(alpha) + PROPERTY(cntrl) + PROPERTY(digit) + PROPERTY(graph) + PROPERTY(lower) + PROPERTY(print) + PROPERTY(punct) + PROPERTY(space) + PROPERTY(upper) + PROPERTY(xdigit) + PROPERTY(blank) +#undef PROPERTY +} + +#if 0 + +static bool +is_combining (unsigned int ch) +{ + /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt + file. In 3.0.1 it was identical to the union of the general categories + "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the + PropList.txt file, so we take the latter definition. */ + return (unicode_attributes[ch].name != NULL + && unicode_attributes[ch].category[0] == 'M' + && (unicode_attributes[ch].category[1] == 'n' + || unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'e')); +} + +static bool +is_combining_level3 (unsigned int ch) +{ + return is_combining (ch) + && !(unicode_attributes[ch].combining[0] != '\0' + && unicode_attributes[ch].combining[0] != '0' + && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200); +} + +/* Return the UCS symbol string for a Unicode character. */ +static const char * +ucs_symbol (unsigned int i) +{ + static char buf[11+1]; + + sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i); + return buf; +} + +/* Return the UCS symbol range string for a Unicode characters interval. */ +static const char * +ucs_symbol_range (unsigned int low, unsigned int high) +{ + static char buf[24+1]; + + strcpy (buf, ucs_symbol (low)); + strcat (buf, ".."); + strcat (buf, ucs_symbol (high)); + return buf; +} + +/* Output a character class (= property) table. */ + +static void +output_charclass (FILE *stream, const char *classname, + bool (*func) (unsigned int)) +{ + char table[0x110000]; + unsigned int i; + bool need_semicolon; + const int max_column = 75; + int column; + + for (i = 0; i < 0x110000; i++) + table[i] = (int) func (i); + + fprintf (stream, "%s ", classname); + need_semicolon = false; + column = 1000; + for (i = 0; i < 0x110000; ) + { + if (!table[i]) + i++; + else + { + unsigned int low, high; + char buf[25]; + + low = i; + do + i++; + while (i < 0x110000 && table[i]); + high = i - 1; + + if (low == high) + strcpy (buf, ucs_symbol (low)); + else + strcpy (buf, ucs_symbol_range (low, high)); + + if (need_semicolon) + { + fprintf (stream, ";"); + column++; + } + + if (column + strlen (buf) > max_column) + { + fprintf (stream, "/\n "); + column = 3; + } + + fprintf (stream, "%s", buf); + column += strlen (buf); + need_semicolon = true; + } + } + fprintf (stream, "\n"); +} + +/* Output a character mapping table. */ + +static void +output_charmap (FILE *stream, const char *mapname, + unsigned int (*func) (unsigned int)) +{ + char table[0x110000]; + unsigned int i; + bool need_semicolon; + const int max_column = 75; + int column; + + for (i = 0; i < 0x110000; i++) + table[i] = (func (i) != i); + + fprintf (stream, "%s ", mapname); + need_semicolon = false; + column = 1000; + for (i = 0; i < 0x110000; i++) + if (table[i]) + { + char buf[25+1]; + + strcpy (buf, "("); + strcat (buf, ucs_symbol (i)); + strcat (buf, ","); + strcat (buf, ucs_symbol (func (i))); + strcat (buf, ")"); + + if (need_semicolon) + { + fprintf (stream, ";"); + column++; + } + + if (column + strlen (buf) > max_column) + { + fprintf (stream, "/\n "); + column = 3; + } + + fprintf (stream, "%s", buf); + column += strlen (buf); + need_semicolon = true; + } + fprintf (stream, "\n"); +} + +/* Output the width table. */ + +static void +output_widthmap (FILE *stream) +{ +} + +/* Output the tables to the given file. */ + +static void +output_tables (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "escape_char /\n"); + fprintf (stream, "comment_char %%\n"); + fprintf (stream, "\n"); + fprintf (stream, "%% Generated automatically by gen-uni-tables.c for Unicode %s.\n", + version); + fprintf (stream, "\n"); + + fprintf (stream, "LC_IDENTIFICATION\n"); + fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version); + fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n"); + fprintf (stream, "address \"\"\n"); + fprintf (stream, "contact \"\"\n"); + fprintf (stream, "email \"bug-glibc@gnu.org\"\n"); + fprintf (stream, "tel \"\"\n"); + fprintf (stream, "fax \"\"\n"); + fprintf (stream, "language \"\"\n"); + fprintf (stream, "territory \"Earth\"\n"); + fprintf (stream, "revision \"%s\"\n", version); + { + time_t now; + char date[11]; + now = time (NULL); + strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now)); + fprintf (stream, "date \"%s\"\n", date); + } + fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n"); + fprintf (stream, "END LC_IDENTIFICATION\n"); + fprintf (stream, "\n"); + + /* Verifications. */ + for (ch = 0; ch < 0x110000; ch++) + { + /* toupper restriction: "Only characters specified for the keywords + lower and upper shall be specified. */ + if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch))) + fprintf (stderr, + "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n", + ucs_symbol (ch), ch, to_upper (ch)); + + /* tolower restriction: "Only characters specified for the keywords + lower and upper shall be specified. */ + if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch))) + fprintf (stderr, + "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n", + ucs_symbol (ch), ch, to_lower (ch)); + + /* alpha restriction: "Characters classified as either upper or lower + shall automatically belong to this class. */ + if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch)) + fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch)); + + /* alpha restriction: "No character specified for the keywords cntrl, + digit, punct or space shall be specified." */ + if (is_alpha (ch) && is_cntrl (ch)) + fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch)); + if (is_alpha (ch) && is_digit (ch)) + fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch)); + if (is_alpha (ch) && is_punct (ch)) + fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch)); + if (is_alpha (ch) && is_space (ch)) + fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch)); + + /* space restriction: "No character specified for the keywords upper, + lower, alpha, digit, graph or xdigit shall be specified." + upper, lower, alpha already checked above. */ + if (is_space (ch) && is_digit (ch)) + fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch)); + if (is_space (ch) && is_graph (ch)) + fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch)); + if (is_space (ch) && is_xdigit (ch)) + fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch)); + + /* cntrl restriction: "No character specified for the keywords upper, + lower, alpha, digit, punct, graph, print or xdigit shall be + specified." upper, lower, alpha already checked above. */ + if (is_cntrl (ch) && is_digit (ch)) + fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch)); + if (is_cntrl (ch) && is_punct (ch)) + fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch)); + if (is_cntrl (ch) && is_graph (ch)) + fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch)); + if (is_cntrl (ch) && is_print (ch)) + fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch)); + if (is_cntrl (ch) && is_xdigit (ch)) + fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch)); + + /* punct restriction: "No character specified for the keywords upper, + lower, alpha, digit, cntrl, xdigit or as the <space> character shall + be specified." upper, lower, alpha, cntrl already checked above. */ + if (is_punct (ch) && is_digit (ch)) + fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch)); + if (is_punct (ch) && is_xdigit (ch)) + fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch)); + if (is_punct (ch) && (ch == 0x0020)) + fprintf (stderr, "%s is punct\n", ucs_symbol (ch)); + + /* graph restriction: "No character specified for the keyword cntrl + shall be specified." Already checked above. */ + + /* print restriction: "No character specified for the keyword cntrl + shall be specified." Already checked above. */ + + /* graph - print relation: differ only in the <space> character. + How is this possible if there are more than one space character?! + I think susv2/xbd/locale.html should speak of "space characters", + not "space character". */ + if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch))) + fprintf (stderr, + "%s is print but not graph|<space>\n", ucs_symbol (ch)); + if (!is_print (ch) && (is_graph (ch) || ch == 0x0020)) + fprintf (stderr, + "%s is graph|<space> but not print\n", ucs_symbol (ch)); + } + + fprintf (stream, "LC_CTYPE\n"); + output_charclass (stream, "upper", is_upper); + output_charclass (stream, "lower", is_lower); + output_charclass (stream, "alpha", is_alpha); + output_charclass (stream, "digit", is_digit); + output_charclass (stream, "outdigit", is_outdigit); + output_charclass (stream, "blank", is_blank); + output_charclass (stream, "space", is_space); + output_charclass (stream, "cntrl", is_cntrl); + output_charclass (stream, "punct", is_punct); + output_charclass (stream, "xdigit", is_xdigit); + output_charclass (stream, "graph", is_graph); + output_charclass (stream, "print", is_print); + output_charclass (stream, "class \"combining\";", is_combining); + output_charclass (stream, "class \"combining_level3\";", is_combining_level3); + output_charmap (stream, "toupper", to_upper); + output_charmap (stream, "tolower", to_lower); + output_charmap (stream, "map \"totitle\";", to_title); + output_widthmap (stream); + fprintf (stream, "END LC_CTYPE\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +#endif + +/* ========================================================================= */ + +/* The width property from the EastAsianWidth.txt file. + Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */ +const char * unicode_width[0x110000]; + +/* Stores in unicode_width[] the width property from the EastAsianWidth.txt + file. */ +static void +fill_width (const char *width_filename) +{ + unsigned int i, j; + FILE *stream; + char field0[FIELDLEN]; + char field1[FIELDLEN]; + char field2[FIELDLEN]; + int lineno = 0; + + for (i = 0; i < 0x110000; i++) + unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL); + + stream = fopen (width_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", width_filename); + exit (1); + } + + for (;;) + { + int n; + int c; + + lineno++; + c = getc (stream); + if (c == EOF) + break; + if (c == '#') + { + do c = getc (stream); while (c != EOF && c != '\n'); + continue; + } + ungetc (c, stream); + n = getfield (stream, field0, ';'); + n += getfield (stream, field1, ' '); + n += getfield (stream, field2, '\n'); + if (n == 0) + break; + if (n != 3) + { + fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno); + exit (1); + } + i = strtoul (field0, NULL, 16); + if (strstr (field0, "..") != NULL) + { + /* Deal with a range. */ + j = strtoul (strstr (field0, "..") + 2, NULL, 16); + for (; i <= j; i++) + unicode_width[i] = strdup (field1); + } + else + { + /* Single character line. */ + unicode_width[i] = strdup (field1); + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", width_filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Non-spacing attribute and width. */ + +/* The non-spacing attribute table consists of: + - Non-spacing characters; generated from PropList.txt or + "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt" + - Format control characters; generated from + "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" + - Zero width characters; generated from + "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" + */ + +static bool +is_nonspacing (unsigned int ch) +{ + return (unicode_attributes[ch].name != NULL + && (get_bidi_category (ch) == UC_BIDI_NSM + || is_category_Cc (ch) || is_category_Cf (ch) + || strncmp (unicode_attributes[ch].name, "ZERO WIDTH ", 11) == 0)); +} + +static void +output_nonspacing_property (const char *filename) +{ + FILE *stream; + int ind[0x110000 / 0x200]; + unsigned int i; + unsigned int i_max; + int next_ind; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + next_ind = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = false; + unsigned int ch; + + if (i != 0xe0000 / 0x200) /* The 0xe0000 block is handled by code. */ + for (ch = i * 0x200; ch < (i + 1) * 0x200; ch++) + if (is_nonspacing (ch)) + { + nontrivial = true; + break; + } + if (nontrivial) + ind[i] = next_ind++; + else + ind[i] = -1; + } + + fprintf (stream, "static const unsigned char nonspacing_table_data[%d*64] = {\n", + next_ind); + i_max = 0; + for (i = 0; i < 0x110000 / 0x200; i++) + { + bool nontrivial = (ind[i] >= 0); + + if (nontrivial) + { + unsigned int j; + + fprintf (stream, " /* 0x%04x-0x%04x */\n", i * 0x200, (i + 1) * 0x200 - 1); + for (j = 0; j < 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + unsigned int l; + unsigned char bits = 0; + + for (l = 0; l < 8; l++) + { + unsigned int ch = i * 0x200 + j * 0x40 + k * 8 + l; + + if (is_nonspacing (ch)) + bits |= 1 << l; + } + fprintf (stream, " 0x%02x%c", bits, + ind[i] + 1 == next_ind && j == 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + i * 0x200 + j * 0x40, i * 0x200 + (j + 1) * 0x40 - 1); + } + i_max = i; + } + } + fprintf (stream, "};\n"); + + i_max = ((i_max + 8 - 1) / 8) * 8; + fprintf (stream, "static const signed char nonspacing_table_ind[%u] = {\n", + i_max); + { + unsigned int j; + + for (j = 0; j < i_max / 8; j++) + { + unsigned int k; + + fprintf (stream, " "); + for (k = 0; k < 8; k++) + { + i = j * 8 + k; + fprintf (stream, " %2d%c", ind[i], + j == i_max / 8 - 1 && k == 8 - 1 ? ' ' : ','); + } + fprintf (stream, " /* 0x%04x-0x%04x */\n", + j * 8 * 0x200, (j + 1) * 8 * 0x200 - 1); + } + } + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Returns the width of ch as one of 0, '0', '1', '2', 'A'. */ +static char +symbolic_width (unsigned int ch) +{ + /* Test for unassigned character. */ + if (is_property_unassigned_code_value (ch)) + { + /* Unicode TR#11 section "Unassigned and Private-Use Characters". */ + if (ch >= 0xE000 && ch <= 0xF8FF) /* Private Use */ + return 'A'; + if ((ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs block */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A block */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs block */ + || (ch >= 0x20000 && ch <= 0x2FFFF) /* Supplementary Ideographic Plane */ + || (ch >= 0x30000 && ch <= 0x3FFFF) /* Tertiary Ideographic Plane */) + return '2'; + return 0; + } + else + { + /* Test for non-spacing or control character. */ + if (is_category_Cc (ch) && ch < 0x00A0) + return 0; + if (is_nonspacing (ch)) + return '0'; + /* Test for double-width character. */ + if (unicode_width[ch] != NULL + && (strcmp (unicode_width[ch], "W") == 0 + || strcmp (unicode_width[ch], "F") == 0)) + return '2'; + /* Test for half-width character. */ + if (unicode_width[ch] != NULL + && strcmp (unicode_width[ch], "H") == 0) + return '1'; + } + /* In ancient CJK encodings, Cyrillic and most other characters are + double-width as well. */ + if (ch >= 0x00A1 && ch < 0x10000) + return 'A'; + return '1'; +} + +static void +output_width_property_test (const char *filename) +{ + FILE *stream; + unsigned int interval_start, interval_end, ch; + char interval_value; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + interval_value = 0; + interval_start = interval_end = 0; /* avoid GCC warning */ + for (ch = 0; ch < 0x110000; ch++) + { + char value = symbolic_width (ch); + if (value != 0) /* skip Cc control characters and unassigned characters */ + { + if (value == interval_value) + /* Extend the interval. */ + interval_end = ch; + else + { + /* Terminate the interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); + else + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); + } + /* Start a new interval. */ + interval_start = interval_end = ch; + interval_value = value; + } + } + } + /* Terminate the last interval. */ + if (interval_value != 0) + { + if (interval_end == interval_start) + fprintf (stream, "%04X\t\t%c\n", interval_start, interval_value); + else + fprintf (stream, "%04X..%04X\t%c\n", interval_start, interval_end, interval_value); + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Line breaking classification. + Updated for Unicode TR #14 revision 26. */ + +enum +{ + /* Values >= 25 are resolved at run time. */ + LBP_BK = 25, /* mandatory break */ +/*LBP_CR, carriage return - not used here because it's a DOSism */ +/*LBP_LF, line feed - not used here because it's a DOSism */ + LBP_CM = 26, /* attached characters and combining marks */ +/*LBP_NL, next line - not used here because it's equivalent to LBP_BK */ +/*LBP_SG, surrogates - not used here because they are not characters */ + LBP_WJ = 0, /* word joiner */ + LBP_ZW = 27, /* zero width space */ + LBP_GL = 1, /* non-breaking (glue) */ + LBP_SP = 28, /* space */ + LBP_B2 = 2, /* break opportunity before and after */ + LBP_BA = 3, /* break opportunity after */ + LBP_BB = 4, /* break opportunity before */ + LBP_HY = 5, /* hyphen */ + LBP_CB = 29, /* contingent break opportunity */ + LBP_CL = 6, /* closing punctuation */ + LBP_CP = 7, /* closing parenthesis */ + LBP_EX = 8, /* exclamation/interrogation */ + LBP_IN = 9, /* inseparable */ + LBP_NS = 10, /* non starter */ + LBP_OP = 11, /* opening punctuation */ + LBP_QU = 12, /* ambiguous quotation */ + LBP_IS = 13, /* infix separator (numeric) */ + LBP_NU = 14, /* numeric */ + LBP_PO = 15, /* postfix (numeric) */ + LBP_PR = 16, /* prefix (numeric) */ + LBP_SY = 17, /* symbols allowing breaks */ + LBP_AI = 30, /* ambiguous (alphabetic or ideograph) */ + LBP_AL = 18, /* ordinary alphabetic and symbol characters */ + LBP_H2 = 19, /* Hangul LV syllable */ + LBP_H3 = 20, /* Hangul LVT syllable */ + LBP_ID = 21, /* ideographic */ + LBP_JL = 22, /* Hangul L Jamo */ + LBP_JV = 23, /* Hangul V Jamo */ + LBP_JT = 24, /* Hangul T Jamo */ + LBP_SA = 31, /* complex context (South East Asian) */ + LBP_XX = 32 /* unknown */ +}; + +/* Returns the line breaking classification for ch, as a bit mask. */ +static int64_t +get_lbp (unsigned int ch) +{ + int64_t attr = 0; + + if (unicode_attributes[ch].name != NULL) + { + /* mandatory break */ + if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */ + || ch == 0x000C /* form feed */ + || ch == 0x000B /* line tabulation */ + || ch == 0x2028 /* LINE SEPARATOR */ + || ch == 0x2029 /* PARAGRAPH SEPARATOR */) + attr |= (int64_t) 1 << LBP_BK; + + if (ch == 0x2060 /* WORD JOINER */ + || ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */) + attr |= (int64_t) 1 << LBP_WJ; + + /* zero width space */ + if (ch == 0x200B /* ZERO WIDTH SPACE */) + attr |= (int64_t) 1 << LBP_ZW; + + /* non-breaking (glue) */ + if (ch == 0x00A0 /* NO-BREAK SPACE */ + || ch == 0x202F /* NARROW NO-BREAK SPACE */ + || ch == 0x180E /* MONGOLIAN VOWEL SEPARATOR */ + || ch == 0x034F /* COMBINING GRAPHEME JOINER */ + || ch == 0x2007 /* FIGURE SPACE */ + || ch == 0x2011 /* NON-BREAKING HYPHEN */ + || ch == 0x0F08 /* TIBETAN MARK SBRUL SHAD */ + || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */ + || ch == 0x0F12 /* TIBETAN MARK RGYA GRAM SHAD */ + || (ch >= 0x035C && ch <= 0x0362) /* COMBINING DOUBLE ... */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0FD9 /* TIBETAN MARK LEADING MCHAN RTAGS */ + || ch == 0x0FDA /* TIBETAN MARK TRAILING MCHAN RTAGS */) + attr |= (int64_t) 1 << LBP_GL; + + /* space */ + if (ch == 0x0020 /* SPACE */) + attr |= (int64_t) 1 << LBP_SP; + + /* break opportunity before and after */ + if (ch == 0x2014 /* EM DASH */) + attr |= (int64_t) 1 << LBP_B2; + + /* break opportunity after */ + if (/* Breaking Spaces */ + ch == 0x1680 /* OGHAM SPACE MARK */ + || ch == 0x2000 /* EN QUAD */ + || ch == 0x2001 /* EM QUAD */ + || ch == 0x2002 /* EN SPACE */ + || ch == 0x2003 /* EM SPACE */ + || ch == 0x2004 /* THREE-PER-EM SPACE */ + || ch == 0x2005 /* FOUR-PER-EM SPACE */ + || ch == 0x2006 /* SIX-PER-EM SPACE */ + || ch == 0x2008 /* PUNCTUATION SPACE */ + || ch == 0x2009 /* THIN SPACE */ + || ch == 0x200A /* HAIR SPACE */ + || ch == 0x205F /* MEDIUM MATHEMATICAL SPACE */ + /* Tabs */ + || ch == 0x0009 /* tab */ + /* Conditional Hyphens */ + || ch == 0x00AD /* SOFT HYPHEN */ + /* Breaking Hyphens */ + || ch == 0x058A /* ARMENIAN HYPHEN */ + || ch == 0x1400 /* CANADIAN SYLLABICS HYPHEN */ + || ch == 0x2010 /* HYPHEN */ + || ch == 0x2012 /* FIGURE DASH */ + || ch == 0x2013 /* EN DASH */ + /* Visible Word Dividers */ + || ch == 0x05BE /* HEBREW PUNCTUATION MAQAF */ + || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ + || ch == 0x1361 /* ETHIOPIC WORDSPACE */ + || ch == 0x17D8 /* KHMER SIGN BEYYAL */ + || ch == 0x17DA /* KHMER SIGN KOOMUUT */ + || ch == 0x2027 /* HYPHENATION POINT */ + || ch == 0x007C /* VERTICAL LINE */ + /* Historic Word Separators */ + || ch == 0x16EB /* RUNIC SINGLE PUNCTUATION */ + || ch == 0x16EC /* RUNIC MULTIPLE PUNCTUATION */ + || ch == 0x16ED /* RUNIC CROSS PUNCTUATION */ + || ch == 0x2056 /* THREE DOT PUNCTUATION */ + || ch == 0x2058 /* FOUR DOT PUNCTUATION */ + || ch == 0x2059 /* FIVE DOT PUNCTUATION */ + || ch == 0x205A /* TWO DOT PUNCTUATION */ + || ch == 0x205B /* FOUR DOT MARK */ + || ch == 0x205D /* TRICOLON */ + || ch == 0x205E /* VERTICAL FOUR DOTS */ + || ch == 0x2E19 /* PALM BRANCH */ + || ch == 0x2E2A /* TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x2E2B /* ONE DOT OVER TWO DOTS PUNCTUATION */ + || ch == 0x2E2C /* SQUARED FOUR DOT PUNCTUATION */ + || ch == 0x2E2D /* FIVE DOT PUNCTUATION */ + || ch == 0x2E30 /* RING POINT */ + || ch == 0x2E31 /* WORD SEPARATOR MIDDLE DOT */ + || ch == 0x10100 /* AEGEAN WORD SEPARATOR LINE */ + || ch == 0x10101 /* AEGEAN WORD SEPARATOR DOT */ + || ch == 0x10102 /* AEGEAN CHECK MARK */ + || ch == 0x1039F /* UGARITIC WORD DIVIDER */ + || ch == 0x103D0 /* OLD PERSIAN WORD DIVIDER */ + || ch == 0x1091F /* PHOENICIAN WORD SEPARATOR */ + || ch == 0x12470 /* CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER */ + /* Dandas */ + || ch == 0x0964 /* DEVANAGARI DANDA */ + || ch == 0x0965 /* DEVANAGARI DOUBLE DANDA */ + || ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ + || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ + || ch == 0x104A /* MYANMAR SIGN LITTLE SECTION */ + || ch == 0x104B /* MYANMAR SIGN SECTION */ + || ch == 0x1735 /* PHILIPPINE SINGLE PUNCTUATION */ + || ch == 0x1736 /* PHILIPPINE DOUBLE PUNCTUATION */ + || ch == 0x17D4 /* KHMER SIGN KHAN */ + || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ + || ch == 0x1B5E /* BALINESE CARIK SIKI */ + || ch == 0x1B5F /* BALINESE CARIK PAREREN */ + || ch == 0xA8CE /* SAURASHTRA DANDA */ + || ch == 0xA8CF /* SAURASHTRA DOUBLE DANDA */ + || ch == 0xAA5D /* CHAM PUNCTUATION DANDA */ + || ch == 0xAA5E /* CHAM PUNCTUATION DOUBLE DANDA */ + || ch == 0xAA5F /* CHAM PUNCTUATION TRIPLE DANDA */ + || ch == 0x10A56 /* KHAROSHTHI PUNCTUATION DANDA */ + || ch == 0x10A57 /* KHAROSHTHI PUNCTUATION DOUBLE DANDA */ + /* Tibetan */ + || ch == 0x0F34 /* TIBETAN MARK BSDUS RTAGS */ + || ch == 0x0F7F /* TIBETAN SIGN RNAM BCAD */ + || ch == 0x0F85 /* TIBETAN MARK PALUTA */ + || ch == 0x0FBE /* TIBETAN KU RU KHA */ + || ch == 0x0FBF /* TIBETAN KU RU KHA BZHI MIG CAN */ + || ch == 0x0FD2 /* TIBETAN MARK NYIS TSHEG */ + /* Other Terminating Punctuation */ + || ch == 0x1804 /* MONGOLIAN COLON */ + || ch == 0x1805 /* MONGOLIAN FOUR DOTS */ + || ch == 0x1B5A /* BALINESE PANTI */ + || ch == 0x1B5B /* BALINESE PAMADA */ + || ch == 0x1B5D /* BALINESE CARIK PAMUNGKAH */ + || ch == 0x1B60 /* BALINESE PAMENENG */ + || ch == 0x1C3B /* LEPCHA PUNCTUATION TA-ROL */ + || ch == 0x1C3C /* LEPCHA PUNCTUATION NYET THYOOM TA-ROL */ + || ch == 0x1C3D /* LEPCHA PUNCTUATION CER-WA */ + || ch == 0x1C3E /* LEPCHA PUNCTUATION TSHOOK CER-WA */ + || ch == 0x1C3F /* LEPCHA PUNCTUATION TSHOOK */ + || ch == 0x1C7E /* OL CHIKI PUNCTUATION MUCAAD */ + || ch == 0x1C7F /* OL CHIKI PUNCTUATION DOUBLE MUCAAD */ + || ch == 0x2CFA /* COPTIC OLD NUBIAN DIRECT QUESTION MARK */ + || ch == 0x2CFB /* COPTIC OLD NUBIAN INDIRECT QUESTION MARK */ + || ch == 0x2CFC /* COPTIC OLD NUBIAN VERSE DIVIDER */ + || ch == 0x2CFF /* COPTIC MORPHOLOGICAL DIVIDER */ + || (ch >= 0x2E0E && ch <= 0x2E15) /* EDITORIAL CORONIS .. UPWARDS ANCORA */ + || ch == 0x2E17 /* DOUBLE OBLIQUE HYPHEN */ + || ch == 0xA60D /* VAI COMMA */ + || ch == 0xA60F /* VAI QUESTION MARK */ + || ch == 0xA92E /* KAYAH LI SIGN CWI */ + || ch == 0xA92F /* KAYAH LI SIGN SHYA */ + || ch == 0x10A50 /* KHAROSHTHI PUNCTUATION DOT */ + || ch == 0x10A51 /* KHAROSHTHI PUNCTUATION SMALL CIRCLE */ + || ch == 0x10A52 /* KHAROSHTHI PUNCTUATION CIRCLE */ + || ch == 0x10A53 /* KHAROSHTHI PUNCTUATION CRESCENT BAR */ + || ch == 0x10A54 /* KHAROSHTHI PUNCTUATION MANGALAM */ + || ch == 0x10A55 /* KHAROSHTHI PUNCTUATION LOTUS */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x2D70 /* TIFINAGH SEPARATOR MARK */ + || ch == 0xA4FE /* LISU PUNCTUATION COMMA */ + || ch == 0xA4FF /* LISU PUNCTUATION FULL STOP */ + || ch == 0xA6F3 /* BAMUM FULL STOP */ + || ch == 0xA6F4 /* BAMUM COLON */ + || ch == 0xA6F5 /* BAMUM COMMA */ + || ch == 0xA6F6 /* BAMUM SEMICOLON */ + || ch == 0xA6F7 /* BAMUM QUESTION MARK */ + || ch == 0xA9C7 /* JAVANESE PADA PANGKAT */ + || ch == 0xA9C8 /* JAVANESE PADA LINGSA */ + || ch == 0xA9C9 /* JAVANESE PADA LUNGSI */ + || ch == 0xABEB /* MEETEI MAYEK CHEIKHEI */ + || ch == 0x10857 /* IMPERIAL ARAMAIC SECTION SIGN */ + || ch == 0x10B39 /* AVESTAN ABBREVIATION MARK */ + || ch == 0x10B3A /* TINY TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3B /* SMALL TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3C /* LARGE TWO DOTS OVER ONE DOT PUNCTUATION */ + || ch == 0x10B3D /* LARGE ONE DOT OVER TWO DOTS PUNCTUATION */ + || ch == 0x10B3E /* LARGE TWO RINGS OVER ONE RING PUNCTUATION */ + || ch == 0x10B3F /* LARGE ONE RING OVER TWO RINGS PUNCTUATION */ + || ch == 0x11047 /* BRAHMI DANDA */ + || ch == 0x11048 /* BRAHMI DOUBLE DANDA */ + || ch == 0x110BE /* KAITHI SECTION MARK */ + || ch == 0x110BF /* KAITHI DOUBLE SECTION MARK */ + || ch == 0x110C0 /* KAITHI DANDA */ + || ch == 0x110C1 /* KAITHI DOUBLE DANDA */ + || ch == 0x12471 /* CUNEIFORM PUNCTUATION SIGN VERTICAL COLON */ + || ch == 0x12472 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON */ + || ch == 0x12473 /* CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON */) + attr |= (int64_t) 1 << LBP_BA; + + /* break opportunity before */ + if (ch == 0x00B4 /* ACUTE ACCENT */ + || ch == 0x1FFD /* GREEK OXIA */ + || ch == 0x02DF /* MODIFIER LETTER CROSS ACCENT */ + || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ + || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ + || ch == 0x0F01 /* TIBETAN MARK GTER YIG MGO TRUNCATED A */ + || ch == 0x0F02 /* TIBETAN MARK GTER YIG MGO -UM RNAM BCAD MA */ + || ch == 0x0F03 /* TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA */ + || ch == 0x0F04 /* TIBETAN MARK INITIAL YIG MGO MDUN MA */ + || ch == 0x0F06 /* TIBETAN MARK CARET YIG MGO PHUR SHAD MA */ + || ch == 0x0F07 /* TIBETAN MARK YIG MGO TSHEG SHAD MA */ + || ch == 0x0F09 /* TIBETAN MARK BSKUR YIG MGO */ + || ch == 0x0F0A /* TIBETAN MARK BKA- SHOG YIG MGO */ + || ch == 0x0FD0 /* TIBETAN MARK BSKA- SHOG GI MGO RGYAN */ + || ch == 0x0FD1 /* TIBETAN MARK MNYAM YIG GI MGO RGYAN */ + || ch == 0x0FD3 /* TIBETAN MARK INITIAL BRDA RNYING YIG MGO MDUN MA */ + || ch == 0xA874 /* PHAGS-PA SINGLE HEAD MARK */ + || ch == 0xA875 /* PHAGS-PA DOUBLE HEAD MARK */ + || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) + attr |= (int64_t) 1 << LBP_BB; + + /* hyphen */ + if (ch == 0x002D /* HYPHEN-MINUS */) + attr |= (int64_t) 1 << LBP_HY; + + /* contingent break opportunity */ + if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) + attr |= (int64_t) 1 << LBP_CB; + + /* closing parenthesis */ + if (ch == 0x0029 /* RIGHT PARENTHESIS */ + || ch == 0x005D /* RIGHT SQUARE BRACKET */) + attr |= (int64_t) 1 << LBP_CP; + + /* closing punctuation */ + if ((unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 'e' + && !(attr & ((int64_t) 1 << LBP_CP))) + || ch == 0x3001 /* IDEOGRAPHIC COMMA */ + || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ + || ch == 0xFE11 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA */ + || ch == 0xFE12 /* PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP */ + || ch == 0xFE50 /* SMALL COMMA */ + || ch == 0xFE52 /* SMALL FULL STOP */ + || ch == 0xFF0C /* FULLWIDTH COMMA */ + || ch == 0xFF0E /* FULLWIDTH FULL STOP */ + || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ + || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x1325B /* EGYPTIAN HIEROGLYPH O006D */ + || ch == 0x1325C /* EGYPTIAN HIEROGLYPH O006E */ + || ch == 0x1325D /* EGYPTIAN HIEROGLYPH O006F */ + || ch == 0x13282 /* EGYPTIAN HIEROGLYPH O033A */ + || ch == 0x13287 /* EGYPTIAN HIEROGLYPH O036B */ + || ch == 0x13289 /* EGYPTIAN HIEROGLYPH O036D */ + || ch == 0x1337A /* EGYPTIAN HIEROGLYPH V011B */ + || ch == 0x1337B /* EGYPTIAN HIEROGLYPH V011C */) + attr |= (int64_t) 1 << LBP_CL; + + /* exclamation/interrogation */ + if (ch == 0x0021 /* EXCLAMATION MARK */ + || ch == 0x003F /* QUESTION MARK */ + || ch == 0x05C6 /* HEBREW PUNCTUATION NUN HAFUKHA */ + || ch == 0x061B /* ARABIC SEMICOLON */ + || ch == 0x061E /* ARABIC TRIPLE DOT PUNCTUATION MARK */ + || ch == 0x061F /* ARABIC QUESTION MARK */ + || ch == 0x06D4 /* ARABIC FULL STOP */ + || ch == 0x07F9 /* NKO EXCLAMATION MARK */ + || ch == 0x0F0D /* TIBETAN MARK SHAD */ + || ch == 0x0F0E /* TIBETAN MARK NYIS SHAD */ + || ch == 0x0F0F /* TIBETAN MARK TSHEG SHAD */ + || ch == 0x0F10 /* TIBETAN MARK NYIS TSHEG SHAD */ + || ch == 0x0F11 /* TIBETAN MARK RIN CHEN SPUNGS SHAD */ + || ch == 0x0F14 /* TIBETAN MARK GTER TSHEG */ + || ch == 0x1802 /* MONGOLIAN COMMA */ + || ch == 0x1803 /* MONGOLIAN FULL STOP */ + || ch == 0x1808 /* MONGOLIAN MANCHU COMMA */ + || ch == 0x1809 /* MONGOLIAN MANCHU FULL STOP */ + || ch == 0x1944 /* LIMBU EXCLAMATION MARK */ + || ch == 0x1945 /* LIMBU QUESTION MARK */ + || ch == 0x2762 /* HEAVY EXCLAMATION MARK ORNAMENT */ + || ch == 0x2763 /* HEAVY HEART EXCLAMATION MARK ORNAMENT */ + || ch == 0x2CF9 /* COPTIC OLD NUBIAN FULL STOP */ + || ch == 0x2CFE /* COPTIC FULL STOP */ + || ch == 0x2E2E /* REVERSED QUESTION MARK */ + || ch == 0xA60E /* VAI FULL STOP */ + || ch == 0xA876 /* PHAGS-PA MARK SHAD */ + || ch == 0xA877 /* PHAGS-PA MARK DOUBLE SHAD */ + || ch == 0xFE15 /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */ + || ch == 0xFE16 /* PRESENTATION FORM FOR VERTICAL QUESTION MARK */ + || ch == 0xFE56 /* SMALL QUESTION MARK */ + || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ + || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ + || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) + attr |= (int64_t) 1 << LBP_EX; + + /* inseparable */ + if (ch == 0x2024 /* ONE DOT LEADER */ + || ch == 0x2025 /* TWO DOT LEADER */ + || ch == 0x2026 /* HORIZONTAL ELLIPSIS */ + || ch == 0xFE19 /* PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS */) + attr |= (int64_t) 1 << LBP_IN; + + /* non starter */ + if (ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ + || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ + || ch == 0x203D /* INTERROBANG */ + || ch == 0x2047 /* DOUBLE QUESTION MARK */ + || ch == 0x2048 /* QUESTION EXCLAMATION MARK */ + || ch == 0x2049 /* EXCLAMATION QUESTION MARK */ + || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ + || ch == 0x301C /* WAVE DASH */ + || ch == 0x303C /* MASU MARK */ + || ch == 0x303B /* VERTICAL IDEOGRAPHIC ITERATION MARK */ + || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ + || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ + || ch == 0x309D /* HIRAGANA ITERATION MARK */ + || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ + || ch == 0x30A0 /* KATAKANA-HIRAGANA DOUBLE HYPHEN */ + || ch == 0x30FB /* KATAKANA MIDDLE DOT */ + || ch == 0x30FC /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + || ch == 0x30FD /* KATAKANA ITERATION MARK */ + || ch == 0x30FE /* KATAKANA VOICED ITERATION MARK */ + || ch == 0xA015 /* YI SYLLABLE WU */ + || ch == 0xFE54 /* SMALL SEMICOLON */ + || ch == 0xFE55 /* SMALL COLON */ + || ch == 0xFF1A /* FULLWIDTH COLON */ + || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ + || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ + || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + || ch == 0xFF9E /* HALFWIDTH KATAKANA VOICED SOUND MARK */ + || ch == 0xFF9F /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */ + || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL + || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) + attr |= (int64_t) 1 << LBP_NS; + + /* opening punctuation */ + if ((unicode_attributes[ch].category[0] == 'P' + && unicode_attributes[ch].category[1] == 's') + || ch == 0x00A1 /* INVERTED EXCLAMATION MARK */ + || ch == 0x00BF /* INVERTED QUESTION MARK */ + || ch == 0x2E18 /* INVERTED INTERROBANG */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x13258 /* EGYPTIAN HIEROGLYPH O006A */ + || ch == 0x13259 /* EGYPTIAN HIEROGLYPH O006B */ + || ch == 0x1325A /* EGYPTIAN HIEROGLYPH O006C */ + || ch == 0x13286 /* EGYPTIAN HIEROGLYPH O036A */ + || ch == 0x13288 /* EGYPTIAN HIEROGLYPH O036C */ + || ch == 0x13379 /* EGYPTIAN HIEROGLYPH V011A */) + attr |= (int64_t) 1 << LBP_OP; + + /* ambiguous quotation */ + if ((unicode_attributes[ch].category[0] == 'P' + && (unicode_attributes[ch].category[1] == 'f' + || unicode_attributes[ch].category[1] == 'i')) + || ch == 0x0022 /* QUOTATION MARK */ + || ch == 0x0027 /* APOSTROPHE */ + || ch == 0x275B /* HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x275C /* HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x275D /* HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x275E /* HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT */ + || ch == 0x2E00 /* RIGHT ANGLE SUBSTITUTION MARKER */ + || ch == 0x2E01 /* RIGHT ANGLE DOTTED SUBSTITUTION MARKER */ + || ch == 0x2E06 /* RAISED INTERPOLATION MARKER */ + || ch == 0x2E07 /* RAISED DOTTED INTERPOLATION MARKER */ + || ch == 0x2E08 /* DOTTED TRANSPOSITION MARKER */ + || ch == 0x2E0B /* RAISED SQUARE */) + attr |= (int64_t) 1 << LBP_QU; + + /* infix separator (numeric) */ + if (ch == 0x002C /* COMMA */ + || ch == 0x002E /* FULL STOP */ + || ch == 0x003A /* COLON */ + || ch == 0x003B /* SEMICOLON */ + || ch == 0x037E /* GREEK QUESTION MARK */ + || ch == 0x0589 /* ARMENIAN FULL STOP */ + || ch == 0x060C /* ARABIC COMMA */ + || ch == 0x060D /* ARABIC DATE SEPARATOR */ + || ch == 0x07F8 /* NKO COMMA */ + || ch == 0x2044 /* FRACTION SLASH */ + || ch == 0xFE10 /* PRESENTATION FORM FOR VERTICAL COMMA */ + || ch == 0xFE13 /* PRESENTATION FORM FOR VERTICAL COLON */ + || ch == 0xFE14 /* PRESENTATION FORM FOR VERTICAL SEMICOLON */) + attr |= (int64_t) 1 << LBP_IS; + + /* numeric */ + if ((unicode_attributes[ch].category[0] == 'N' + && unicode_attributes[ch].category[1] == 'd' + && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) + || ch == 0x066B /* ARABIC DECIMAL SEPARATOR */ + || ch == 0x066C /* ARABIC THOUSANDS SEPARATOR */) + attr |= (int64_t) 1 << LBP_NU; + + /* postfix (numeric) */ + if (ch == 0x0025 /* PERCENT SIGN */ + || ch == 0x00A2 /* CENT SIGN */ + || ch == 0x00B0 /* DEGREE SIGN */ + || ch == 0x060B /* AFGHANI SIGN */ + || ch == 0x066A /* ARABIC PERCENT SIGN */ + || ch == 0x2030 /* PER MILLE SIGN */ + || ch == 0x2031 /* PER TEN THOUSAND SIGN */ + || ch == 0x2032 /* PRIME */ + || ch == 0x2033 /* DOUBLE PRIME */ + || ch == 0x2034 /* TRIPLE PRIME */ + || ch == 0x2035 /* REVERSED PRIME */ + || ch == 0x2036 /* REVERSED DOUBLE PRIME */ + || ch == 0x2037 /* REVERSED TRIPLE PRIME */ + || ch == 0x20A7 /* PESETA SIGN */ + || ch == 0x2103 /* DEGREE CELSIUS */ + || ch == 0x2109 /* DEGREE FAHRENHEIT */ + || ch == 0xFDFC /* RIAL SIGN */ + || ch == 0xFE6A /* SMALL PERCENT SIGN */ + || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ + || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x0609 /* ARABIC-INDIC PER MILLE SIGN */ + || ch == 0x060A /* ARABIC-INDIC PER TEN THOUSAND SIGN */ + || ch == 0x09F2 /* BENGALI RUPEE MARK */ + || ch == 0x09F3 /* BENGALI RUPEE SIGN */ + || ch == 0x09F9 /* BENGALI CURRENCY DENOMINATOR SIXTEEN */ + || ch == 0x0D79 /* MALAYALAM DATE MARK */ + || ch == 0x20B6 /* LIVRE TOURNOIS SIGN */ + || ch == 0xA838 /* NORTH INDIC RUPEE MARK */) + attr |= (int64_t) 1 << LBP_PO; + + /* prefix (numeric) */ + if ((unicode_attributes[ch].category[0] == 'S' + && unicode_attributes[ch].category[1] == 'c') + || ch == 0x002B /* PLUS SIGN */ + || ch == 0x005C /* REVERSE SOLIDUS */ + || ch == 0x00B1 /* PLUS-MINUS SIGN */ + || ch == 0x2116 /* NUMERO SIGN */ + || ch == 0x2212 /* MINUS SIGN */ + || ch == 0x2213 /* MINUS-OR-PLUS SIGN */) + if (!(attr & ((int64_t) 1 << LBP_PO))) + attr |= (int64_t) 1 << LBP_PR; + + /* symbols allowing breaks */ + if (ch == 0x002F /* SOLIDUS */) + attr |= (int64_t) 1 << LBP_SY; + + if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) == 0) + attr |= (int64_t) 1 << LBP_H2; + + if (ch >= 0xAC00 && ch <= 0xD7A3 && ((ch - 0xAC00) % 28) != 0) + attr |= (int64_t) 1 << LBP_H3; + + if ((ch >= 0x1100 && ch <= 0x115F) || (ch >= 0xA960 && ch <= 0xA97C)) + attr |= (int64_t) 1 << LBP_JL; + + if ((ch >= 0x1160 && ch <= 0x11A7) || (ch >= 0xD7B0 && ch <= 0xD7C6)) + attr |= (int64_t) 1 << LBP_JV; + + if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB)) + attr |= (int64_t) 1 << LBP_JT; + + /* complex context (South East Asian) */ + if (((unicode_attributes[ch].category[0] == 'C' + && unicode_attributes[ch].category[1] == 'f') + || (unicode_attributes[ch].category[0] == 'L' + && (unicode_attributes[ch].category[1] == 'm' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'M' + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'n') + && ch != 0x1A7F /* TAI THAM COMBINING CRYPTOGRAMMIC DOT */) + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x109E /* MYANMAR SYMBOL SHAN ONE */ + || ch == 0x109F /* MYANMAR SYMBOL SHAN EXCLAMATION */ + || ch == 0x19DA /* NEW TAI LUE THAM DIGIT ONE */ + || ch == 0x19DE /* NEW TAI LUE SIGN LAE */ + || ch == 0x19DF /* NEW TAI LUE SIGN LAEV */ + || (ch >= 0x1AA0 && ch <= 0x1AAD) /* TAI THAM SIGN */ + || (ch >= 0xAA77 && ch <= 0xAA79) /* MYANMAR SYMBOL AITON */ + || (ch >= 0xAADE && ch <= 0xAADF) /* TAI VIET SYMBOL */) + && ((ch >= 0x0E00 && ch <= 0x0EFF) /* Thai, Lao */ + || (ch >= 0x1000 && ch <= 0x109F) /* Myanmar */ + || (ch >= 0x1780 && ch <= 0x17FF) /* Khmer */ + || (ch >= 0x1950 && ch <= 0x19DF) /* Tai Le, New Tai Lue */ + || (ch >= 0x1A20 && ch <= 0x1AAF) /* Tai Tham */ + || (ch >= 0xAA60 && ch <= 0xAADF) /* Myanmar Extended-A, Tai Viet */)) + attr |= (int64_t) 1 << LBP_SA; + + /* attached characters and combining marks */ + if ((unicode_attributes[ch].category[0] == 'M' + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'e' + || unicode_attributes[ch].category[1] == 'n')) + || (unicode_attributes[ch].category[0] == 'C' + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'f') + && ch != 0x110BD /* KAITHI NUMBER SIGN */)) + if (!(attr & (((int64_t) 1 << LBP_BK) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_WJ) | ((int64_t) 1 << LBP_ZW)))) + attr |= (int64_t) 1 << LBP_CM; + + /* ideographic */ + if ((ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ + || ch == 0x3000 /* IDEOGRAPHIC SPACE */ + || (ch >= 0x3040 && ch <= 0x309F) /* HIRAGANA */ + || (ch >= 0x30A0 && ch <= 0x30FF) /* KATAKANA */ + || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Ideograph */ + || (ch >= 0xF900 && ch <= 0xFAD9) /* CJK COMPATIBILITY IDEOGRAPH */ + || (ch >= 0xA000 && ch <= 0xA48F) /* YI SYLLABLE */ + || (ch >= 0xA490 && ch <= 0xA4CF) /* YI RADICAL */ + || ch == 0xFE62 /* SMALL PLUS SIGN */ + || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ + || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ + || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ + || ch == 0xFE66 /* SMALL EQUALS SIGN */ + || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ + || (ch >= 0x20000 && ch <= 0x2A6D6) /* CJK Ideograph Extension B */ + || (ch >= 0x2F800 && ch <= 0x2FA1D) /* CJK COMPATIBILITY IDEOGRAPH */ + || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL + || (ch >= 0x3000 && ch <= 0x33FF + && !(attr & (((int64_t) 1 << LBP_CM) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP)))) + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0xFE30 /* PRESENTATION FORM FOR VERTICAL TWO DOT LEADER */ + || ch == 0xFE31 /* PRESENTATION FORM FOR VERTICAL EM DASH */ + || ch == 0xFE32 /* PRESENTATION FORM FOR VERTICAL EN DASH */ + || ch == 0xFE33 /* PRESENTATION FORM FOR VERTICAL LOW LINE */ + || ch == 0xFE34 /* PRESENTATION FORM FOR VERTICAL WAVY LOW LINE */ + || ch == 0xFE45 /* SESAME DOT */ + || ch == 0xFE46 /* WHITE SESAME DOT */ + || ch == 0xFE49 /* DASHED OVERLINE */ + || ch == 0xFE4A /* CENTRELINE OVERLINE */ + || ch == 0xFE4B /* WAVY OVERLINE */ + || ch == 0xFE4C /* DOUBLE WAVY OVERLINE */ + || ch == 0xFE4D /* DASHED LOW LINE */ + || ch == 0xFE4E /* CENTRELINE LOW LINE */ + || ch == 0xFE4F /* WAVY LOW LINE */ + || ch == 0xFE51 /* SMALL IDEOGRAPHIC COMMA */ + || ch == 0xFE58 /* SMALL EM DASH */ + || ch == 0xFE5F /* SMALL NUMBER SIGN */ + || ch == 0xFE60 /* SMALL AMPERSAND */ + || ch == 0xFE61 /* SMALL ASTERISK */ + || ch == 0xFE68 /* SMALL REVERSE SOLIDUS */ + || ch == 0xFE6B /* SMALL COMMERCIAL AT */ + || ch == 0xFF02 /* FULLWIDTH QUOTATION MARK */ + || ch == 0xFF03 /* FULLWIDTH NUMBER SIGN */ + || ch == 0xFF06 /* FULLWIDTH AMPERSAND */ + || ch == 0xFF07 /* FULLWIDTH APOSTROPHE */ + || ch == 0xFF0A /* FULLWIDTH ASTERISK */ + || ch == 0xFF0B /* FULLWIDTH PLUS SIGN */ + || ch == 0xFF0D /* FULLWIDTH HYPHEN-MINUS */ + || ch == 0xFF0F /* FULLWIDTH SOLIDUS */ + || ch == 0xFF1C /* FULLWIDTH LESS-THAN SIGN */ + || ch == 0xFF1D /* FULLWIDTH EQUALS SIGN */ + || ch == 0xFF1E /* FULLWIDTH GREATER-THAN SIGN */ + || ch == 0xFF20 /* FULLWIDTH COMMERCIAL AT */ + || ch == 0xFF3C /* FULLWIDTH REVERSE SOLIDUS */ + || ch == 0xFF3E /* FULLWIDTH CIRCUMFLEX ACCENT */ + || ch == 0xFF3F /* FULLWIDTH LOW LINE */ + || ch == 0xFF40 /* FULLWIDTH GRAVE ACCENT */ + || ch == 0xFF5C /* FULLWIDTH VERTICAL LINE */ + || ch == 0xFF5E /* FULLWIDTH TILDE */ + || ch == 0xFFE2 /* FULLWIDTH NOT SIGN */ + || ch == 0xFFE3 /* FULLWIDTH MACRON */ + || ch == 0xFFE4 /* FULLWIDTH BROKEN BAR */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || (ch >= 0x1B000 && ch <= 0x1B001) /* Kana Supplement */ + || (ch >= 0x1F200 && ch <= 0x1F248) /* Enclosed Ideographic Supplement */ + || (ch >= 0x1F250 && ch <= 0x1F251) /* Enclosed Ideographic Supplement */ + || (ch >= 0x2A700 && ch <= 0x2B734) /* CJK Ideograph Extension C */ + || (ch >= 0x2B740 && ch <= 0x2B81D) /* CJK Ideograph Extension D */) + if (!(attr & (((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_CM)))) + { + /* ambiguous (ideograph) ? */ + if ((unicode_width[ch] != NULL + && unicode_width[ch][0] == 'A' + && ch >= 0x2000) + || ch == 0x24EA /* CIRCLED DIGIT ZERO */ + || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */) + attr |= (int64_t) 1 << LBP_AI; + else + attr |= (int64_t) 1 << LBP_ID; + } + + /* ordinary alphabetic and symbol characters */ + if ((unicode_attributes[ch].category[0] == 'L' + && (unicode_attributes[ch].category[1] == 'u' + || unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 't' + || unicode_attributes[ch].category[1] == 'm' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'S' + && (unicode_attributes[ch].category[1] == 'm' + || unicode_attributes[ch].category[1] == 'k' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'N' + && (unicode_attributes[ch].category[1] == 'l' + || unicode_attributes[ch].category[1] == 'o')) + || (unicode_attributes[ch].category[0] == 'P' + && (unicode_attributes[ch].category[1] == 'c' + || unicode_attributes[ch].category[1] == 'd' + || unicode_attributes[ch].category[1] == 'o')) + || ch == 0x0600 /* ARABIC NUMBER SIGN */ + || ch == 0x0601 /* ARABIC SIGN SANAH */ + || ch == 0x0602 /* ARABIC FOOTNOTE MARKER */ + || ch == 0x0603 /* ARABIC SIGN SAFHA */ + || ch == 0x06DD /* ARABIC END OF AYAH */ + || ch == 0x070F /* SYRIAC ABBREVIATION MARK */ + || ch == 0x2061 /* FUNCTION APPLICATION */ + || ch == 0x2062 /* INVISIBLE TIMES */ + || ch == 0x2063 /* INVISIBLE SEPARATOR */ + || ch == 0x2064 /* INVISIBLE PLUS */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x110BD /* KAITHI NUMBER SIGN */) + if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID)))) + { + /* ambiguous (alphabetic) ? */ + if ((unicode_width[ch] != NULL + && unicode_width[ch][0] == 'A' + && ch >= 0x2000 + /* Extra exceptions for compatibility with Unicode LineBreak.txt. */ + && ch != 0x2022 /* BULLET */ + && ch != 0x203E /* OVERLINE */ + && ch != 0x2126 /* OHM SIGN */ + && ch != 0x2153 /* VULGAR FRACTION ONE THIRD */ + && ch != 0x215C /* VULGAR FRACTION THREE EIGHTHS */ + && ch != 0x215D /* VULGAR FRACTION FIVE EIGHTHS */ + && ch != 0x21B8 /* NORTH WEST ARROW TO LONG BAR */ + && ch != 0x21B9 /* LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR */ + && ch != 0x21E7 /* UPWARDS WHITE ARROW */ + && ch != 0x24FF /* NEGATIVE CIRCLED DIGIT ZERO */ + && ch != 0x273D /* HEAVY TEARDROP-SPOKED ASTERISK */) + || ch == 0x00A7 /* SECTION SIGN */ + || ch == 0x00A8 /* DIAERESIS */ + || ch == 0x00AA /* FEMININE ORDINAL INDICATOR */ + || ch == 0x00B2 /* SUPERSCRIPT TWO */ + || ch == 0x00B3 /* SUPERSCRIPT THREE */ + || ch == 0x00B6 /* PILCROW SIGN */ + || ch == 0x00B7 /* MIDDLE DOT */ + || ch == 0x00B8 /* CEDILLA */ + || ch == 0x00B9 /* SUPERSCRIPT ONE */ + || ch == 0x00BA /* MASCULINE ORDINAL INDICATOR */ + || ch == 0x00BC /* VULGAR FRACTION ONE QUARTER */ + || ch == 0x00BD /* VULGAR FRACTION ONE HALF */ + || ch == 0x00BE /* VULGAR FRACTION THREE QUARTERS */ + || ch == 0x00D7 /* MULTIPLICATION SIGN */ + || ch == 0x00F7 /* DIVISION SIGN */ + || ch == 0x02C7 /* CARON */ + || ch == 0x02C9 /* MODIFIER LETTER MACRON */ + || ch == 0x02CA /* MODIFIER LETTER ACUTE ACCENT */ + || ch == 0x02CB /* MODIFIER LETTER GRAVE ACCENT */ + || ch == 0x02CD /* MODIFIER LETTER LOW MACRON */ + || ch == 0x02D0 /* MODIFIER LETTER TRIANGULAR COLON */ + || ch == 0x02D8 /* BREVE */ + || ch == 0x02D9 /* DOT ABOVE */ + || ch == 0x02DA /* RING ABOVE */ + || ch == 0x02DB /* OGONEK */ + || ch == 0x02DD /* DOUBLE ACUTE ACCENT */ + || ch == 0x24EA /* CIRCLED DIGIT ZERO */ + || (ch >= 0x2780 && ch <= 0x2793) /* DINGBAT ... CIRCLED DIGIT ... */ + /* Extra characters for compatibility with Unicode LineBreak.txt. */ + || ch == 0x2155 /* VULGAR FRACTION ONE FIFTH */ + || ch == 0x2574 /* BOX DRAWINGS LIGHT LEFT */ + || ch == 0x2616 /* WHITE SHOGI PIECE */ + || ch == 0x2617 /* BLACK SHOGI PIECE */) + attr |= (int64_t) 1 << LBP_AI; + else + attr |= (int64_t) 1 << LBP_AL; + attr &= ~((int64_t) 1 << LBP_CM); + } + } + else + { + /* Unassigned character. */ + if ((ch >= 0x3400 && ch <= 0x4DBF) /* CJK Unified Ideographs Extension A */ + || (ch >= 0x4E00 && ch <= 0x9FFF) /* CJK Unified Ideographs */ + || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK Compatibility Ideographs */ + || (ch >= 0x20000 && ch <= 0x2A6FF) /* CJK Unified Ideographs Extension B */ + || (ch >= 0x2A700 && ch <= 0x2F7FF) /* CJK Unified Ideographs Extension C, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x2F800 && ch <= 0x2FFFD) /* CJK Compatibility Ideographs Supplement, + Supplementary Ideographic Plane (Plane 2) outside of blocks */ + || (ch >= 0x30000 && ch <= 0x3FFFD) /* Tertiary Ideographic Plane (Plane 3) outside of blocks */) + attr |= (int64_t) 1 << LBP_ID; + } + + if (attr == 0) + /* unknown */ + attr |= (int64_t) 1 << LBP_XX; + + return attr; +} + +/* Output the line breaking properties in a human readable format. */ +static void +debug_output_lbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int64_t attr = get_lbp (i); + if (attr != (int64_t) 1 << LBP_XX) + { + fprintf (stream, "0x%04X", i); +#define PRINT_BIT(attr,bit) \ + if (attr & ((int64_t) 1 << bit)) fprintf (stream, " " #bit); + PRINT_BIT(attr,LBP_BK); + PRINT_BIT(attr,LBP_CM); + PRINT_BIT(attr,LBP_WJ); + PRINT_BIT(attr,LBP_ZW); + PRINT_BIT(attr,LBP_GL); + PRINT_BIT(attr,LBP_SP); + PRINT_BIT(attr,LBP_B2); + PRINT_BIT(attr,LBP_BA); + PRINT_BIT(attr,LBP_BB); + PRINT_BIT(attr,LBP_HY); + PRINT_BIT(attr,LBP_CB); + PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); + PRINT_BIT(attr,LBP_EX); + PRINT_BIT(attr,LBP_IN); + PRINT_BIT(attr,LBP_NS); + PRINT_BIT(attr,LBP_OP); + PRINT_BIT(attr,LBP_QU); + PRINT_BIT(attr,LBP_IS); + PRINT_BIT(attr,LBP_NU); + PRINT_BIT(attr,LBP_PO); + PRINT_BIT(attr,LBP_PR); + PRINT_BIT(attr,LBP_SY); + PRINT_BIT(attr,LBP_AI); + PRINT_BIT(attr,LBP_AL); + PRINT_BIT(attr,LBP_H2); + PRINT_BIT(attr,LBP_H3); + PRINT_BIT(attr,LBP_ID); + PRINT_BIT(attr,LBP_JL); + PRINT_BIT(attr,LBP_JV); + PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_SA); + PRINT_BIT(attr,LBP_XX); +#undef PRINT_BIT + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_lbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + debug_output_lbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* The line breaking property from the LineBreak.txt file. */ +int unicode_org_lbp[0x110000]; + +/* Stores in unicode_org_lbp[] the line breaking property from the + LineBreak.txt file. */ +static void +fill_org_lbp (const char *linebreak_filename) +{ + unsigned int i, j; + FILE *stream; + char field0[FIELDLEN]; + char field1[FIELDLEN]; + char field2[FIELDLEN]; + int lineno = 0; + + for (i = 0; i < 0x110000; i++) + unicode_org_lbp[i] = LBP_XX; + + stream = fopen (linebreak_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", linebreak_filename); + exit (1); + } + + for (;;) + { + int n; + int c; + int value; + + lineno++; + c = getc (stream); + if (c == EOF) + break; + if (c == '#') + { + do c = getc (stream); while (c != EOF && c != '\n'); + continue; + } + ungetc (c, stream); + n = getfield (stream, field0, ';'); + n += getfield (stream, field1, ' '); + n += getfield (stream, field2, '\n'); + if (n == 0) + break; + if (n != 3) + { + fprintf (stderr, "short line in '%s':%d\n", linebreak_filename, + lineno); + exit (1); + } +#define TRY(bit) else if (strcmp (field1, #bit + 4) == 0) value = bit; + if (false) {} + TRY(LBP_BK) + TRY(LBP_CM) + TRY(LBP_WJ) + TRY(LBP_ZW) + TRY(LBP_GL) + TRY(LBP_SP) + TRY(LBP_B2) + TRY(LBP_BA) + TRY(LBP_BB) + TRY(LBP_HY) + TRY(LBP_CB) + TRY(LBP_CL) + TRY(LBP_CP) + TRY(LBP_EX) + TRY(LBP_IN) + TRY(LBP_NS) + TRY(LBP_OP) + TRY(LBP_QU) + TRY(LBP_IS) + TRY(LBP_NU) + TRY(LBP_PO) + TRY(LBP_PR) + TRY(LBP_SY) + TRY(LBP_AI) + TRY(LBP_AL) + TRY(LBP_H2) + TRY(LBP_H3) + TRY(LBP_ID) + TRY(LBP_JL) + TRY(LBP_JV) + TRY(LBP_JT) + TRY(LBP_SA) + TRY(LBP_XX) +#undef TRY + else if (strcmp (field1, "LF") == 0) value = LBP_BK; + else if (strcmp (field1, "CR") == 0) value = LBP_BK; + else if (strcmp (field1, "NL") == 0) value = LBP_BK; + else if (strcmp (field1, "SG") == 0) value = LBP_XX; + else + { + fprintf (stderr, "unknown property value \"%s\" in '%s':%d\n", + field1, linebreak_filename, lineno); + exit (1); + } + i = strtoul (field0, NULL, 16); + if (strstr (field0, "..") != NULL) + { + /* Deal with a range. */ + j = strtoul (strstr (field0, "..") + 2, NULL, 16); + for (; i <= j; i++) + unicode_org_lbp[i] = value; + } + else + { + /* Single character line. */ + unicode_org_lbp[i] = value; + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", linebreak_filename); + exit (1); + } +} + +/* Output the line breaking properties in a human readable format. */ +static void +debug_output_org_lbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int attr = unicode_org_lbp[i]; + if (attr != LBP_XX) + { + fprintf (stream, "0x%04X", i); +#define PRINT_BIT(attr,bit) \ + if (attr == bit) fprintf (stream, " " #bit); + PRINT_BIT(attr,LBP_BK); + PRINT_BIT(attr,LBP_CM); + PRINT_BIT(attr,LBP_WJ); + PRINT_BIT(attr,LBP_ZW); + PRINT_BIT(attr,LBP_GL); + PRINT_BIT(attr,LBP_SP); + PRINT_BIT(attr,LBP_B2); + PRINT_BIT(attr,LBP_BA); + PRINT_BIT(attr,LBP_BB); + PRINT_BIT(attr,LBP_HY); + PRINT_BIT(attr,LBP_CB); + PRINT_BIT(attr,LBP_CL); + PRINT_BIT(attr,LBP_CP); + PRINT_BIT(attr,LBP_EX); + PRINT_BIT(attr,LBP_IN); + PRINT_BIT(attr,LBP_NS); + PRINT_BIT(attr,LBP_OP); + PRINT_BIT(attr,LBP_QU); + PRINT_BIT(attr,LBP_IS); + PRINT_BIT(attr,LBP_NU); + PRINT_BIT(attr,LBP_PO); + PRINT_BIT(attr,LBP_PR); + PRINT_BIT(attr,LBP_SY); + PRINT_BIT(attr,LBP_AI); + PRINT_BIT(attr,LBP_AL); + PRINT_BIT(attr,LBP_H2); + PRINT_BIT(attr,LBP_H3); + PRINT_BIT(attr,LBP_ID); + PRINT_BIT(attr,LBP_JL); + PRINT_BIT(attr,LBP_JV); + PRINT_BIT(attr,LBP_JT); + PRINT_BIT(attr,LBP_SA); + PRINT_BIT(attr,LBP_XX); +#undef PRINT_BIT + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_org_lbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + debug_output_org_lbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE lbp_table +#define ELEMENT unsigned char +#define DEFAULT LBP_XX +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_lbp (FILE *stream1, FILE *stream2) +{ + unsigned int i; + struct lbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + t.p = 7; + t.q = 9; + lbp_table_init (&t); + + for (i = 0; i < 0x110000; i++) + { + int64_t attr = get_lbp (i); + + /* Now attr should contain exactly one bit. */ + if (attr == 0 || ((attr & (attr - 1)) != 0)) + abort (); + + if (attr != (int64_t) 1 << LBP_XX) + { + unsigned int log2_attr; + for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); + + lbp_table_add (&t, i, log2_attr); + } + } + + lbp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream1, "#define lbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream1, "\n"); + fprintf (stream1, "typedef struct\n"); + fprintf (stream1, " {\n"); + fprintf (stream1, " int level1[%zu];\n", t.level1_size); + fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream1, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream1, " }\n"); + fprintf (stream1, "lbrkprop_t;\n"); + fprintf (stream1, "extern const lbrkprop_t unilbrkprop;\n"); + + fprintf (stream2, "const lbrkprop_t unilbrkprop =\n"); + fprintf (stream2, "{\n"); + fprintf (stream2, " {"); + if (t.level1_size > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream2, ","); + } + if (t.level1_size > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); + if (i+1 < t.level2_size << t.q) + fprintf (stream2, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; + const char *value_string; + switch (value) + { +#define CASE(x) case x: value_string = #x; break; + CASE(LBP_BK); + CASE(LBP_CM); + CASE(LBP_WJ); + CASE(LBP_ZW); + CASE(LBP_GL); + CASE(LBP_SP); + CASE(LBP_B2); + CASE(LBP_BA); + CASE(LBP_BB); + CASE(LBP_HY); + CASE(LBP_CB); + CASE(LBP_CL); + CASE(LBP_CP); + CASE(LBP_EX); + CASE(LBP_IN); + CASE(LBP_NS); + CASE(LBP_OP); + CASE(LBP_QU); + CASE(LBP_IS); + CASE(LBP_NU); + CASE(LBP_PO); + CASE(LBP_PR); + CASE(LBP_SY); + CASE(LBP_AI); + CASE(LBP_AL); + CASE(LBP_H2); + CASE(LBP_H3); + CASE(LBP_ID); + CASE(LBP_JL); + CASE(LBP_JV); + CASE(LBP_JT); + CASE(LBP_SA); + CASE(LBP_XX); +#undef CASE + default: + abort (); + } + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + fprintf (stream2, " %s%s", value_string, + (i+1 < t.level3_size << t.p ? "," : "")); + } + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " }\n"); + fprintf (stream2, "};\n"); +} + +static void +output_lbrk_tables (const char *filename1, const char *filename2, const char *version) +{ + const char *filenames[2]; + FILE *streams[2]; + size_t i; + + filenames[0] = filename1; + filenames[1] = filename2; + + for (i = 0; i < 2; i++) + { + streams[i] = fopen (filenames[i], "w"); + if (streams[i] == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); + exit (1); + } + } + + for (i = 0; i < 2; i++) + { + FILE *stream = streams[i]; + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2008 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"); + fprintf (stream, "\n"); + } + + output_lbp (streams[0], streams[1]); + + for (i = 0; i < 2; i++) + { + if (ferror (streams[i]) || fclose (streams[i])) + { + fprintf (stderr, "error writing to '%s'\n", filenames[i]); + exit (1); + } + } +} + +/* ========================================================================= */ + +/* Word break property. + Updated for Unicode TR #29 revision 17. */ + +/* Possible values of the Word_Break property. */ +enum +{ + WBP_OTHER = 0, + WBP_CR = 11, + WBP_LF = 12, + WBP_NEWLINE = 10, + WBP_EXTEND = 8, + WBP_FORMAT = 9, + WBP_KATAKANA = 1, + WBP_ALETTER = 2, + WBP_MIDNUMLET = 3, + WBP_MIDLETTER = 4, + WBP_MIDNUM = 5, + WBP_NUMERIC = 6, + WBP_EXTENDNUMLET = 7 +}; + +/* Returns the word breaking property for ch, as a bit mask. */ +static int +get_wbp (unsigned int ch) +{ + int attr = 0; + + if (unicode_attributes[ch].name != NULL) + { + if (ch == 0x000D) + attr |= 1 << WBP_CR; + + if (ch == 0x000A) + attr |= 1 << WBP_LF; + + if (ch == 0x000B || ch == 0x000C + || ch == 0x0085 + || ch == 0x2028 || ch == 0x2029) + attr |= 1 << WBP_NEWLINE; + + if (((unicode_properties[ch] >> PROP_GRAPHEME_EXTEND) & 1) != 0 + || (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Mc") == 0)) + attr |= 1 << WBP_EXTEND; + + if (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Cf") == 0 + && ch != 0x200B && ch != 0x200C && ch != 0x200D) + attr |= 1 << WBP_FORMAT; + + if ((unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Katakana") == 0) + || (ch >= 0x3031 && ch <= 0x3035) + || ch == 0x309B || ch == 0x309C || ch == 0x30A0 || ch == 0x30FC + || ch == 0xFF70) + attr |= 1 << WBP_KATAKANA; + + if ((((unicode_properties[ch] >> PROP_ALPHABETIC) & 1) != 0 + || ch == 0x05F3) + && ((unicode_properties[ch] >> PROP_IDEOGRAPHIC) & 1) == 0 + && (attr & (1 << WBP_KATAKANA)) == 0 + && ((get_lbp (ch) >> LBP_SA) & 1) == 0 + && !(unicode_scripts[ch] < numscripts + && strcmp (scripts[unicode_scripts[ch]], "Hiragana") == 0) + && (attr & (1 << WBP_EXTEND)) == 0) + attr |= 1 << WBP_ALETTER; + + if (is_WBP_MIDNUMLET (ch)) + attr |= 1 << WBP_MIDNUMLET; + + if (is_WBP_MIDLETTER (ch)) + attr |= 1 << WBP_MIDLETTER; + + if ((((get_lbp (ch) >> LBP_IS) & 1) != 0 + || ch == 0x066C || ch == 0xFE50 || ch == 0xFE54 || ch == 0xFF0C + || ch == 0xFF1B) + && ch != 0x003A && ch != 0xFE13 && ch != 0x002E) + attr |= 1 << WBP_MIDNUM; + + if (((get_lbp (ch) >> LBP_NU) & 1) != 0 + && ch != 0x066C) + attr |= 1 << WBP_NUMERIC; + + if (unicode_attributes[ch].category != NULL + && strcmp (unicode_attributes[ch].category, "Pc") == 0) + attr |= 1 << WBP_EXTENDNUMLET; + } + + if (attr == 0) + /* other */ + attr |= 1 << WBP_OTHER; + + return attr; +} + +/* Output the word break property in a human readable format. */ +static void +debug_output_wbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int attr = get_wbp (i); + if (attr != 1 << WBP_OTHER) + { + fprintf (stream, "0x%04X", i); + if (attr & (1 << WBP_CR)) + fprintf (stream, " CR"); + if (attr & (1 << WBP_LF)) + fprintf (stream, " LF"); + if (attr & (1 << WBP_NEWLINE)) + fprintf (stream, " Newline"); + if (attr & (1 << WBP_EXTEND)) + fprintf (stream, " Extend"); + if (attr & (1 << WBP_FORMAT)) + fprintf (stream, " Format"); + if (attr & (1 << WBP_KATAKANA)) + fprintf (stream, " Katakana"); + if (attr & (1 << WBP_ALETTER)) + fprintf (stream, " ALetter"); + if (attr & (1 << WBP_MIDNUMLET)) + fprintf (stream, " MidNumLet"); + if (attr & (1 << WBP_MIDLETTER)) + fprintf (stream, " MidLetter"); + if (attr & (1 << WBP_MIDNUM)) + fprintf (stream, " MidNum"); + if (attr & (1 << WBP_NUMERIC)) + fprintf (stream, " Numeric"); + if (attr & (1 << WBP_EXTENDNUMLET)) + fprintf (stream, " ExtendNumLet"); + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_wbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + debug_output_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* The word break property from the WordBreakProperty.txt file. */ +int unicode_org_wbp[0x110000]; + +/* Stores in unicode_org_wbp[] the word break property from the + WordBreakProperty.txt file. */ +static void +fill_org_wbp (const char *wordbreakproperty_filename) +{ + unsigned int i; + FILE *stream; + + for (i = 0; i < 0x110000; i++) + unicode_org_wbp[i] = WBP_OTHER; + + stream = fopen (wordbreakproperty_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", wordbreakproperty_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + int propvalue; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + wordbreakproperty_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + PROP ("CR", WBP_CR) + PROP ("LF", WBP_LF) + PROP ("Newline", WBP_NEWLINE) + PROP ("Extend", WBP_EXTEND) + PROP ("Format", WBP_FORMAT) + PROP ("Katakana", WBP_KATAKANA) + PROP ("ALetter", WBP_ALETTER) + PROP ("MidNumLet", WBP_MIDNUMLET) + PROP ("MidLetter", WBP_MIDLETTER) + PROP ("MidNum", WBP_MIDNUM) + PROP ("Numeric", WBP_NUMERIC) + PROP ("ExtendNumLet", WBP_EXTENDNUMLET) +#undef PROP + { + fprintf (stderr, "unknown property value '%s' in '%s'\n", propname, + wordbreakproperty_filename); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_org_wbp[i] = propvalue; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", wordbreakproperty_filename); + exit (1); + } +} + +/* Output the word break property in a human readable format. */ +static void +debug_output_org_wbp (FILE *stream) +{ + unsigned int i; + + for (i = 0; i < 0x110000; i++) + { + int propvalue = unicode_org_wbp[i]; + if (propvalue != WBP_OTHER) + { + fprintf (stream, "0x%04X", i); +#define PROP(name,value) \ + if (propvalue == value) fprintf (stream, " " name); else + PROP ("CR", WBP_CR) + PROP ("LF", WBP_LF) + PROP ("Newline", WBP_NEWLINE) + PROP ("Extend", WBP_EXTEND) + PROP ("Format", WBP_FORMAT) + PROP ("Katakana", WBP_KATAKANA) + PROP ("ALetter", WBP_ALETTER) + PROP ("MidNumLet", WBP_MIDNUMLET) + PROP ("MidLetter", WBP_MIDLETTER) + PROP ("MidNum", WBP_MIDNUM) + PROP ("Numeric", WBP_NUMERIC) + PROP ("ExtendNumLet", WBP_EXTENDNUMLET) +#undef PROP + fprintf (stream, " ??"); + fprintf (stream, "\n"); + } + } +} + +static void +debug_output_org_wbrk_tables (const char *filename) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + debug_output_org_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE wbp_table +#define ELEMENT unsigned char +#define DEFAULT WBP_OTHER +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_wbp (FILE *stream) +{ + unsigned int i; + struct wbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + t.p = 7; + t.q = 9; + wbp_table_init (&t); + + for (i = 0; i < 0x110000; i++) + { + int attr = get_wbp (i); + + /* Now attr should contain exactly one bit. */ + if (attr == 0 || ((attr & (attr - 1)) != 0)) + abort (); + + if (attr != 1 << WBP_OTHER) + { + unsigned int log2_attr; + for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); + + wbp_table_add (&t, i, log2_attr); + } + } + + wbp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define wbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "\n"); + fprintf (stream, "typedef struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "wbrkprop_t;\n"); + fprintf (stream, "static const wbrkprop_t uniwbrkprop =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (unsigned char)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; + const char *value_string; + switch (value) + { +#define CASE(x) case x: value_string = #x; break; + CASE(WBP_OTHER); + CASE(WBP_CR); + CASE(WBP_LF); + CASE(WBP_NEWLINE); + CASE(WBP_EXTEND); + CASE(WBP_FORMAT); + CASE(WBP_KATAKANA); + CASE(WBP_ALETTER); + CASE(WBP_MIDNUMLET); + CASE(WBP_MIDLETTER); + CASE(WBP_MIDNUM); + CASE(WBP_NUMERIC); + CASE(WBP_EXTENDNUMLET); +#undef CASE + default: + abort (); + } + if (i > 0 && (i % 4) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %s%s", value_string, + (i+1 < t.level3_size << t.p ? "," : "")); + } + if (t.level3_size << t.p > 4) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); +} + +static void +output_wbrk_tables (const char *filename, const char *version) +{ + FILE *stream; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Line breaking properties of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2000-2002, 2004, 2007-2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"); + fprintf (stream, "\n"); + + output_wbp (stream); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Grapheme break property. + Updated for Unicode TR #29 revision 17. */ + +/* Possible values of the Grapheme_Cluster_Break property. */ +enum +{ + GBP_OTHER = 0, + GBP_CR = 1, + GBP_LF = 2, + GBP_CONTROL = 3, + GBP_EXTEND = 4, + GBP_PREPEND = 5, + GBP_SPACINGMARK = 6, + GBP_L = 7, + GBP_V = 8, + GBP_T = 9, + GBP_LV = 10, + GBP_LVT = 11 +}; + +/* Construction of sparse 3-level tables. */ +#define TABLE gbp_table +#define ELEMENT unsigned char +#define DEFAULT GBP_OTHER +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* The grapheme break property from the GraphemeBreakProperty.txt file. */ +int unicode_org_gbp[0x110000]; + +/* Output the unit test data for the grapheme break property. */ +static void +output_gbp_test (const char *filename) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Test the Unicode grapheme break property functions.\n"); + fprintf (stream, " Copyright (C) 2010 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"); + fprintf (stream, "\n"); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + int gbp = unicode_org_gbp[ch]; + const char *gbp_string; + + while (ch + 1 < 0x110000 && unicode_org_gbp[ch + 1] == gbp) + ch++; + + switch (gbp) + { +#define CASE(x) case x: gbp_string = #x; break; + CASE (GBP_OTHER) + CASE (GBP_CR) + CASE (GBP_LF) + CASE (GBP_CONTROL) + CASE (GBP_EXTEND) + CASE (GBP_PREPEND) + CASE (GBP_SPACINGMARK) + CASE (GBP_L) + CASE (GBP_V) + CASE (GBP_T) + CASE (GBP_LV) + CASE (GBP_LVT) +#undef CASE + default: + abort (); + } + + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, "{ 0x%04X, %s }", ch + 1, gbp_string); + + need_comma = true; + } + fprintf (stream, "\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Output the per-character grapheme break property table. */ +static void +output_gbp_table (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct gbp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Grapheme break property of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + gbp_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + gbp_table_add (&t, ch, unicode_org_gbp[ch]); + + gbp_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define gbrkprop_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " unsigned char level3[(%zu << %d) / 2];\n", + t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "unigbrkprop =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (uint8_t) / 2); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < (t.level3_size << t.p) / 2; i++) + { + unsigned char *p = (unsigned char *) (t.result + level3_offset); + unsigned char value0 = p[i * 2]; + unsigned char value1 = p[i * 2 + 1]; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " 0x%02x%s", (value1 << 4) + value0, + (i+1 < (t.level3_size << t.p) / 2 ? "," : "")); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Stores in unicode_org_gbp[] the grapheme breaking property from the + GraphemeBreakProperty.txt file. */ +static void +fill_org_gbp (const char *graphemebreakproperty_filename) +{ + unsigned int i; + FILE *stream; + int lineno = 0; + + for (i = 0; i < 0x110000; i++) + unicode_org_gbp[i] = GBP_OTHER; + + stream = fopen (graphemebreakproperty_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", + graphemebreakproperty_filename); + exit (1); + } + + for (;;) + { + char buf[200+1]; + unsigned int i1, i2; + char padding[200+1]; + char propname[200+1]; + int propvalue; + + lineno++; + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X..%X%[ ;]%[^ ]", &i1, &i2, padding, propname) != 4) + { + if (sscanf (buf, "%X%[ ;]%[^ ]", &i1, padding, propname) != 3) + { + fprintf (stderr, "parse error in '%s'\n", + graphemebreakproperty_filename); + exit (1); + } + i2 = i1; + } +#define PROP(name,value) \ + if (strcmp (propname, name) == 0) propvalue = value; else + PROP ("CR", GBP_CR) + PROP ("LF", GBP_LF) + PROP ("Control", GBP_CONTROL) + PROP ("Extend", GBP_EXTEND) + PROP ("Prepend", GBP_PREPEND) + PROP ("SpacingMark", GBP_SPACINGMARK) + PROP ("L", GBP_L) + PROP ("V", GBP_V) + PROP ("T", GBP_T) + PROP ("LV", GBP_LV) + PROP ("LVT", GBP_LVT) +#undef PROP + { + fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname, + graphemebreakproperty_filename, lineno); + exit (1); + } + if (!(i1 <= i2 && i2 < 0x110000)) + abort (); + + for (i = i1; i <= i2; i++) + unicode_org_gbp[i] = propvalue; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", graphemebreakproperty_filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Composition and decomposition. + Updated for Unicode TR #15 revision 33. */ + +/* Maximum number of characters into which a single Unicode character can be + decomposed. */ +#define MAX_DECOMP_LENGTH 18 + +enum +{ + UC_DECOMP_CANONICAL,/* Canonical decomposition. */ + UC_DECOMP_FONT, /* <font> A font variant (e.g. a blackletter form). */ + UC_DECOMP_NOBREAK, /* <noBreak> A no-break version of a space or hyphen. */ + UC_DECOMP_INITIAL, /* <initial> An initial presentation form (Arabic). */ + UC_DECOMP_MEDIAL, /* <medial> A medial presentation form (Arabic). */ + UC_DECOMP_FINAL, /* <final> A final presentation form (Arabic). */ + UC_DECOMP_ISOLATED,/* <isolated> An isolated presentation form (Arabic). */ + UC_DECOMP_CIRCLE, /* <circle> An encircled form. */ + UC_DECOMP_SUPER, /* <super> A superscript form. */ + UC_DECOMP_SUB, /* <sub> A subscript form. */ + UC_DECOMP_VERTICAL,/* <vertical> A vertical layout presentation form. */ + UC_DECOMP_WIDE, /* <wide> A wide (or zenkaku) compatibility character. */ + UC_DECOMP_NARROW, /* <narrow> A narrow (or hankaku) compatibility character. */ + UC_DECOMP_SMALL, /* <small> A small variant form (CNS compatibility). */ + UC_DECOMP_SQUARE, /* <square> A CJK squared font variant. */ + UC_DECOMP_FRACTION,/* <fraction> A vulgar fraction form. */ + UC_DECOMP_COMPAT /* <compat> Otherwise unspecified compatibility character. */ +}; + +/* Return the decomposition for a Unicode character (ignoring Hangul Jamo + decompositions). Return the type, or -1 for none. */ +static int +get_decomposition (unsigned int ch, + unsigned int *lengthp, unsigned int decomposed[MAX_DECOMP_LENGTH]) +{ + const char *decomposition = unicode_attributes[ch].decomposition; + + if (decomposition != NULL && decomposition[0] != '\0') + { + int type = UC_DECOMP_CANONICAL; + unsigned int length; + char *endptr; + + if (decomposition[0] == '<') + { + const char *rangle; + size_t typelen; + + rangle = strchr (decomposition + 1, '>'); + if (rangle == NULL) + abort (); + typelen = rangle + 1 - decomposition; +#define TYPE(t1,t2) \ + if (typelen == (sizeof (t1) - 1) && memcmp (decomposition, t1, typelen) == 0) \ + type = t2; \ + else + TYPE ("<font>", UC_DECOMP_FONT) + TYPE ("<noBreak>", UC_DECOMP_NOBREAK) + TYPE ("<initial>", UC_DECOMP_INITIAL) + TYPE ("<medial>", UC_DECOMP_MEDIAL) + TYPE ("<final>", UC_DECOMP_FINAL) + TYPE ("<isolated>", UC_DECOMP_ISOLATED) + TYPE ("<circle>", UC_DECOMP_CIRCLE) + TYPE ("<super>", UC_DECOMP_SUPER) + TYPE ("<sub>", UC_DECOMP_SUB) + TYPE ("<vertical>", UC_DECOMP_VERTICAL) + TYPE ("<wide>", UC_DECOMP_WIDE) + TYPE ("<narrow>", UC_DECOMP_NARROW) + TYPE ("<small>", UC_DECOMP_SMALL) + TYPE ("<square>", UC_DECOMP_SQUARE) + TYPE ("<fraction>", UC_DECOMP_FRACTION) + TYPE ("<compat>", UC_DECOMP_COMPAT) + { + fprintf (stderr, "unknown decomposition type %*s\n", (int)typelen, decomposition); + exit (1); + } +#undef TYPE + decomposition = rangle + 1; + if (decomposition[0] == ' ') + decomposition++; + } + for (length = 0; length < MAX_DECOMP_LENGTH; length++) + { + decomposed[length] = strtoul (decomposition, &endptr, 16); + if (endptr == decomposition) + break; + decomposition = endptr; + if (decomposition[0] == ' ') + decomposition++; + } + if (*decomposition != '\0') + /* MAX_DECOMP_LENGTH is too small. */ + abort (); + + *lengthp = length; + return type; + } + else + return -1; +} + +/* Construction of sparse 3-level tables. */ +#define TABLE decomp_table +#define ELEMENT uint16_t +#define DEFAULT (uint16_t)(-1) +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +static void +output_decomposition (FILE *stream1, FILE *stream2) +{ + struct decomp_table t; + unsigned int level1_offset, level2_offset, level3_offset; + unsigned int offset; + unsigned int ch; + unsigned int i; + + t.p = 5; + t.q = 5; + decomp_table_init (&t); + + fprintf (stream1, "extern const unsigned char gl_uninorm_decomp_chars_table[];\n"); + fprintf (stream1, "\n"); + fprintf (stream2, "const unsigned char gl_uninorm_decomp_chars_table[] =\n{"); + offset = 0; + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type >= 0) + { + if (!(offset < (1 << 15))) + abort (); + decomp_table_add (&t, ch, ((type == UC_DECOMP_CANONICAL ? 0 : 1) << 15) | offset); + + /* Produce length 3-bytes entries. */ + if (length == 0) + /* We would need a special representation of zero-length entries. */ + abort (); + for (i = 0; i < length; i++) + { + if (offset > 0) + fprintf (stream2, ","); + if ((offset % 4) == 0) + fprintf (stream2, "\n "); + if (!(decomposed[i] < (1 << 18))) + abort (); + fprintf (stream2, " 0x%02X, 0x%02X, 0x%02X", + (((i+1 < length ? (1 << 23) : 0) + | (i == 0 ? (type << 18) : 0) + | decomposed[i]) >> 16) & 0xff, + (decomposed[i] >> 8) & 0xff, + decomposed[i] & 0xff); + offset++; + } + } + } + + fprintf (stream2, "\n};\n"); + fprintf (stream2, "\n"); + + decomp_table_finalize (&t); + + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream1, "#define decomp_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream1, "\n"); + fprintf (stream1, "typedef struct\n"); + fprintf (stream1, " {\n"); + fprintf (stream1, " int level1[%zu];\n", t.level1_size); + fprintf (stream1, " int level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream1, " unsigned short level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream1, " }\n"); + fprintf (stream1, "decomp_index_table_t;\n"); + fprintf (stream1, "extern const decomp_index_table_t gl_uninorm_decomp_index_table;\n"); + fprintf (stream2, "const decomp_index_table_t gl_uninorm_decomp_index_table =\n"); + fprintf (stream2, "{\n"); + fprintf (stream2, " {"); + if (t.level1_size > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream2, ","); + } + if (t.level1_size > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream2, " %5d", -1); + else + fprintf (stream2, " %5zu", + (offset - level3_offset) / sizeof (uint16_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream2, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " },\n"); + fprintf (stream2, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + uint16_t value = ((uint16_t *) (t.result + level3_offset))[i]; + if (i > 0 && (i % 8) == 0) + fprintf (stream2, "\n "); + fprintf (stream2, " %5d", value == (uint16_t)(-1) ? -1 : value); + if (i+1 < t.level3_size << t.p) + fprintf (stream2, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream2, "\n "); + fprintf (stream2, " }\n"); + fprintf (stream2, "};\n"); +} + +static void +output_decomposition_tables (const char *filename1, const char *filename2, const char *version) +{ + const char *filenames[2]; + FILE *streams[2]; + size_t i; + + filenames[0] = filename1; + filenames[1] = filename2; + + for (i = 0; i < 2; i++) + { + streams[i] = fopen (filenames[i], "w"); + if (streams[i] == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filenames[i]); + exit (1); + } + } + + for (i = 0; i < 2; i++) + { + FILE *stream = streams[i]; + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Decomposition of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + } + + output_decomposition (streams[0], streams[1]); + + for (i = 0; i < 2; i++) + { + if (ferror (streams[i]) || fclose (streams[i])) + { + fprintf (stderr, "error writing to '%s'\n", filenames[i]); + exit (1); + } + } +} + +/* The "excluded from composition" property from the CompositionExclusions.txt file. */ +char unicode_composition_exclusions[0x110000]; + +static void +fill_composition_exclusions (const char *compositionexclusions_filename) +{ + FILE *stream; + unsigned int i; + + stream = fopen (compositionexclusions_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", compositionexclusions_filename); + exit (1); + } + + for (i = 0; i < 0x110000; i++) + unicode_composition_exclusions[i] = 0; + + for (;;) + { + char buf[200+1]; + unsigned int i; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + if (sscanf (buf, "%X", &i) != 1) + { + fprintf (stderr, "parse error in '%s'\n", compositionexclusions_filename); + exit (1); + } + if (!(i < 0x110000)) + abort (); + + unicode_composition_exclusions[i] = 1; + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", compositionexclusions_filename); + exit (1); + } +} + +static void +debug_output_composition_tables (const char *filename) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type == UC_DECOMP_CANONICAL + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + fprintf (stream, "0x%04X\t0x%04X\t0x%04X\t%s\n", + code1, + code2, + combined, + unicode_attributes[code2].combining); + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +static void +output_composition_tables (const char *filename, const char *version) +{ + FILE *stream; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Canonical composition of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + + /* Put a GPL header on it. The gnulib module is under LGPL (although it + still carries the GPL header), and it's gnulib-tool which replaces the + GPL header with an LGPL header. */ + fprintf (stream, "/* Copyright (C) 2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"); + fprintf (stream, "\n"); + + /* The composition table is a set of mappings (code1, code2) -> combined, + with 928 entries, + 367 values for code1 (from 0x003C to 0x30FD), + 54 values for code2 (from 0x0300 to 0x309A). + For a fixed code1, there are from 1 to 19 possible values for code2. + For a fixed code2, there are from 1 to 117 possible values for code1. + This is a very sparse matrix. + + We want an O(1) hash lookup. + + We could implement the hash lookup by mapping (code1, code2) to a linear + combination mul1*code1 + mul2*code2, which is then used as an index into + a 3-level table. But this leads to a table of size 37 KB. + + We use gperf to implement the hash lookup, giving it the 928 sets of + 4 bytes (code1, code2) as input. gperf generates a hash table of size + 1527, which is quite good (60% filled). It requires an auxiliary table + lookup in a table of size 0.5 KB. The total tables size is 11 KB. */ + + fprintf (stream, "struct composition_rule { char codes[6]; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define slot-name codes\n"); + fprintf (stream, "%%define hash-function-name gl_uninorm_compose_hash\n"); + fprintf (stream, "%%define lookup-function-name gl_uninorm_compose_lookup\n"); + fprintf (stream, "%%compare-lengths\n"); + fprintf (stream, "%%compare-strncmp\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%omit-struct-type\n"); + fprintf (stream, "%%%%\n"); + + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int length; + unsigned int decomposed[MAX_DECOMP_LENGTH]; + int type = get_decomposition (ch, &length, decomposed); + + if (type == UC_DECOMP_CANONICAL + /* Consider only binary decompositions. + Exclude singleton decompositions. */ + && length == 2) + { + unsigned int code1 = decomposed[0]; + unsigned int code2 = decomposed[1]; + unsigned int combined = ch; + + /* Exclude decompositions where the first part is not a starter, + i.e. is not of canonical combining class 0. */ + if (strcmp (unicode_attributes[code1].combining, "0") == 0 + /* Exclude characters listed in CompositionExclusions.txt. */ + && !unicode_composition_exclusions[combined]) + { + /* The combined character must now also be a starter. + Verify this. */ + if (strcmp (unicode_attributes[combined].combining, "0") != 0) + abort (); + + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\\x%02x\", 0x%04x\n", + (code1 >> 16) & 0xff, (code1 >> 8) & 0xff, code1 & 0xff, + (code2 >> 16) & 0xff, (code2 >> 8) & 0xff, code2 & 0xff, + combined); + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Output the test for a simple character mapping table to the given file. */ + +static void +output_simple_mapping_test (const char *filename, + const char *function_name, + unsigned int (*func) (unsigned int), + const char *version) +{ + FILE *stream; + bool need_comma; + unsigned int ch; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Test the Unicode character mapping functions.\n"); + fprintf (stream, " Copyright (C) 2009 Free Software Foundation, Inc.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is free software: you can redistribute it and/or modify\n"); + fprintf (stream, " it under the terms of the GNU General Public License as published by\n"); + fprintf (stream, " the Free Software Foundation; either version 3 of the License, or\n"); + fprintf (stream, " (at your option) any later version.\n"); + fprintf (stream, "\n"); + fprintf (stream, " This program is distributed in the hope that it will be useful,\n"); + fprintf (stream, " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"); + fprintf (stream, " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"); + fprintf (stream, " GNU General Public License for more details.\n"); + fprintf (stream, "\n"); + fprintf (stream, " You should have received a copy of the GNU General Public License\n"); + fprintf (stream, " along with this program. If not, see <http://www.gnu.org/licenses/>. */\n"); + fprintf (stream, "\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "\n"); + fprintf (stream, "#include \"test-mapping-part1.h\"\n"); + fprintf (stream, "\n"); + + need_comma = false; + for (ch = 0; ch < 0x110000; ch++) + { + unsigned int value = func (ch); + + if (value != ch) + { + if (need_comma) + fprintf (stream, ",\n"); + fprintf (stream, " { 0x%04X, 0x%04X }", ch, value); + need_comma = true; + } + } + if (need_comma) + fprintf (stream, "\n"); + + fprintf (stream, "\n"); + fprintf (stream, "#define MAP(c) %s (c)\n", function_name); + fprintf (stream, "#include \"test-mapping-part2.h\"\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* Construction of sparse 3-level tables. */ +#define TABLE mapping_table +#define ELEMENT int32_t +#define DEFAULT 0 +#define xmalloc malloc +#define xrealloc realloc +#include "3level.h" + +/* Output a simple character mapping table to the given file. */ + +static void +output_simple_mapping (const char *filename, + unsigned int (*func) (unsigned int), + const char *version) +{ + FILE *stream; + unsigned int ch, i; + struct mapping_table t; + unsigned int level1_offset, level2_offset, level3_offset; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Simple character mapping of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + + t.p = 7; + t.q = 9; + mapping_table_init (&t); + + for (ch = 0; ch < 0x110000; ch++) + { + int value = (int) func (ch) - (int) ch; + + mapping_table_add (&t, ch, value); + } + + mapping_table_finalize (&t); + + /* Offsets in t.result, in memory of this process. */ + level1_offset = + 5 * sizeof (uint32_t); + level2_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t); + level3_offset = + 5 * sizeof (uint32_t) + + t.level1_size * sizeof (uint32_t) + + (t.level2_size << t.q) * sizeof (uint32_t); + + for (i = 0; i < 5; i++) + fprintf (stream, "#define mapping_header_%d %d\n", i, + ((uint32_t *) t.result)[i]); + fprintf (stream, "static const\n"); + fprintf (stream, "struct\n"); + fprintf (stream, " {\n"); + fprintf (stream, " int level1[%zu];\n", t.level1_size); + fprintf (stream, " short level2[%zu << %d];\n", t.level2_size, t.q); + fprintf (stream, " int level3[%zu << %d];\n", t.level3_size, t.p); + fprintf (stream, " }\n"); + fprintf (stream, "u_mapping =\n"); + fprintf (stream, "{\n"); + fprintf (stream, " {"); + if (t.level1_size > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level1_size; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level1_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level2_offset) / sizeof (uint32_t)); + if (i+1 < t.level1_size) + fprintf (stream, ","); + } + if (t.level1_size > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level2_size << t.q; i++) + { + uint32_t offset; + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + offset = ((uint32_t *) (t.result + level2_offset))[i]; + if (offset == 0) + fprintf (stream, " %5d", -1); + else + fprintf (stream, " %5zu", + (offset - level3_offset) / sizeof (int32_t)); + if (i+1 < t.level2_size << t.q) + fprintf (stream, ","); + } + if (t.level2_size << t.q > 8) + fprintf (stream, "\n "); + fprintf (stream, " },\n"); + fprintf (stream, " {"); + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + for (i = 0; i < t.level3_size << t.p; i++) + { + if (i > 0 && (i % 8) == 0) + fprintf (stream, "\n "); + fprintf (stream, " %5d", ((int32_t *) (t.result + level3_offset))[i]); + if (i+1 < t.level3_size << t.p) + fprintf (stream, ","); + } + if (t.level3_size << t.p > 8) + fprintf (stream, "\n "); + fprintf (stream, " }\n"); + fprintf (stream, "};\n"); + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* A special casing context. + A context is negated through x -> -x. */ +enum +{ + SCC_ALWAYS = 0, + SCC_FINAL_SIGMA, + SCC_AFTER_SOFT_DOTTED, + SCC_MORE_ABOVE, + SCC_BEFORE_DOT, + SCC_AFTER_I +}; + +/* A special casing rule. */ +struct special_casing_rule +{ + unsigned int code; + unsigned int lower_mapping[3]; + unsigned int title_mapping[3]; + unsigned int upper_mapping[3]; + unsigned int casefold_mapping[3]; + const char *language; + int context; +}; + +/* The special casing rules. */ +struct special_casing_rule **casing_rules; +unsigned int num_casing_rules; +unsigned int allocated_casing_rules; + +static void +add_casing_rule (struct special_casing_rule *new_rule) +{ + if (num_casing_rules == allocated_casing_rules) + { + allocated_casing_rules = 2 * allocated_casing_rules; + if (allocated_casing_rules < 16) + allocated_casing_rules = 16; + casing_rules = + (struct special_casing_rule **) + realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *)); + } + casing_rules[num_casing_rules++] = new_rule; +} + +/* Stores in casing_rules the special casing rules found in + specialcasing_filename. */ +static void +fill_casing_rules (const char *specialcasing_filename) +{ + FILE *stream; + + stream = fopen (specialcasing_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename); + exit (1); + } + + casing_rules = NULL; + num_casing_rules = 0; + allocated_casing_rules = 0; + + for (;;) + { + char buf[200+1]; + char *scanptr; + char *endptr; + int i; + + unsigned int code; + unsigned int lower_mapping[3]; + unsigned int title_mapping[3]; + unsigned int upper_mapping[3]; + char *language; + int context; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + /* Scan code. */ + scanptr = buf; + code = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan lower mapping. */ + for (i = 0; i < 3; i++) + lower_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + lower_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan title mapping. */ + for (i = 0; i < 3; i++) + title_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + title_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan upper mapping. */ + for (i = 0; i < 3; i++) + upper_mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + upper_mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + scanptr++; + + /* Scan language and context. */ + language = NULL; + context = SCC_ALWAYS; + while (*scanptr == ' ') + scanptr++; + if (*scanptr != '\0' && *scanptr != '#') + { + const char *word_begin = scanptr; + const char *word_end; + + while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') + scanptr++; + word_end = scanptr; + + while (*scanptr == ' ') + scanptr++; + + if (word_end - word_begin == 2) + { + language = (char *) malloc ((word_end - word_begin) + 1); + memcpy (language, word_begin, 2); + language[word_end - word_begin] = '\0'; + word_begin = word_end = NULL; + + if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') + { + word_begin = scanptr; + while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ') + scanptr++; + word_end = scanptr; + } + } + + if (word_end > word_begin) + { + bool negate = false; + + if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0) + { + word_begin += 4; + negate = true; + } + if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0) + context = SCC_FINAL_SIGMA; + else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0) + context = SCC_AFTER_SOFT_DOTTED; + else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0) + context = SCC_MORE_ABOVE; + else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0) + context = SCC_BEFORE_DOT; + else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0) + context = SCC_AFTER_I; + else + { + fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename); + exit (1); + } + if (negate) + context = - context; + } + + if (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", specialcasing_filename); + exit (1); + } + } + + /* Store the rule. */ + { + struct special_casing_rule *new_rule = + (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); + new_rule->code = code; + new_rule->language = language; + new_rule->context = context; + memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping)); + memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping)); + memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping)); + + add_casing_rule (new_rule); + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", specialcasing_filename); + exit (1); + } +} + +/* A casefolding rule. */ +struct casefold_rule +{ + unsigned int code; + unsigned int mapping[3]; + const char *language; +}; + +/* The casefolding rules. */ +struct casefold_rule **casefolding_rules; +unsigned int num_casefolding_rules; +unsigned int allocated_casefolding_rules; + +/* Stores in casefolding_rules the case folding rules found in + casefolding_filename. */ +static void +fill_casefolding_rules (const char *casefolding_filename) +{ + FILE *stream; + + stream = fopen (casefolding_filename, "r"); + if (stream == NULL) + { + fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename); + exit (1); + } + + casefolding_rules = NULL; + num_casefolding_rules = 0; + allocated_casefolding_rules = 0; + + for (;;) + { + char buf[200+1]; + char *scanptr; + char *endptr; + int i; + + unsigned int code; + char type; + unsigned int mapping[3]; + + if (fscanf (stream, "%200[^\n]\n", buf) < 1) + break; + + if (buf[0] == '\0' || buf[0] == '#') + continue; + + /* Scan code. */ + scanptr = buf; + code = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr = endptr; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Scan type. */ + while (*scanptr == ' ') + scanptr++; + + switch (*scanptr) + { + case 'C': case 'F': case 'S': case 'T': + type = *scanptr; + break; + default: + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Scan casefold mapping. */ + for (i = 0; i < 3; i++) + mapping[i] = 0; + for (i = 0; i < 3; i++) + { + while (*scanptr == ' ') + scanptr++; + if (*scanptr == ';') + break; + mapping[i] = strtoul (scanptr, &endptr, 16); + if (endptr == scanptr) + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr = endptr; + } + if (*scanptr != ';') + { + fprintf (stderr, "parse error in '%s'\n", casefolding_filename); + exit (1); + } + scanptr++; + + /* Ignore rules of type 'S'; we use the rules of type 'F' instead. */ + if (type != 'S') + { + const char * const *languages; + unsigned int languages_count; + + /* Type 'T' indicates that the rule is applicable to Turkish + languages only. */ + if (type == 'T') + { + static const char * const turkish_languages[] = { "tr", "az" }; + languages = turkish_languages; + languages_count = 2; + } + else + { + static const char * const all_languages[] = { NULL }; + languages = all_languages; + languages_count = 1; + } + + for (i = 0; i < languages_count; i++) + { + /* Store a new rule. */ + struct casefold_rule *new_rule = + (struct casefold_rule *) malloc (sizeof (struct casefold_rule)); + new_rule->code = code; + memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping)); + new_rule->language = languages[i]; + + if (num_casefolding_rules == allocated_casefolding_rules) + { + allocated_casefolding_rules = 2 * allocated_casefolding_rules; + if (allocated_casefolding_rules < 16) + allocated_casefolding_rules = 16; + casefolding_rules = + (struct casefold_rule **) + realloc (casefolding_rules, + allocated_casefolding_rules * sizeof (struct casefold_rule *)); + } + casefolding_rules[num_casefolding_rules++] = new_rule; + } + } + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error reading from '%s'\n", casefolding_filename); + exit (1); + } +} + +/* Casefold mapping, when it maps to a single character. */ +unsigned int unicode_casefold[0x110000]; + +static unsigned int +to_casefold (unsigned int ch) +{ + return unicode_casefold[ch]; +} + +/* Redistribute the casefolding_rules: + - Rules that map to a single character, language independently, are stored + in unicode_casefold. + - Other rules are merged into casing_rules. */ +static void +redistribute_casefolding_rules (void) +{ + unsigned int ch, i, j; + + /* Fill unicode_casefold[]. */ + for (ch = 0; ch < 0x110000; ch++) + unicode_casefold[ch] = ch; + for (i = 0; i < num_casefolding_rules; i++) + { + struct casefold_rule *cfrule = casefolding_rules[i]; + + if (cfrule->language == NULL && cfrule->mapping[1] == 0) + { + ch = cfrule->code; + if (!(ch < 0x110000)) + abort (); + unicode_casefold[ch] = cfrule->mapping[0]; + } + } + + /* Extend the special casing rules by filling in their casefold_mapping[] + field. */ + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + unsigned int k; + + rule->casefold_mapping[0] = to_casefold (rule->code); + for (k = 1; k < 3; k++) + rule->casefold_mapping[k] = 0; + } + + /* Now merge the other casefolding rules into casing_rules. */ + for (i = 0; i < num_casefolding_rules; i++) + { + struct casefold_rule *cfrule = casefolding_rules[i]; + + if (!(cfrule->language == NULL && cfrule->mapping[1] == 0)) + { + /* Find a rule that applies to the same code, same language, and it + has context SCC_ALWAYS. At the same time, update all rules that + have the same code and same or more specific language. */ + struct special_casing_rule *found_rule = NULL; + + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + + if (rule->code == cfrule->code + && (cfrule->language == NULL + || (rule->language != NULL + && strcmp (rule->language, cfrule->language) == 0))) + { + memcpy (rule->casefold_mapping, cfrule->mapping, + sizeof (rule->casefold_mapping)); + + if ((cfrule->language == NULL + ? rule->language == NULL + : rule->language != NULL + && strcmp (rule->language, cfrule->language) == 0) + && rule->context == SCC_ALWAYS) + { + /* Found it. */ + found_rule = rule; + } + } + } + + if (found_rule == NULL) + { + /* Create a new rule. */ + struct special_casing_rule *new_rule = + (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule)); + + /* Try to find a rule that applies to the same code, no language + restriction, and with context SCC_ALWAYS. */ + for (j = 0; j < num_casing_rules; j++) + { + struct special_casing_rule *rule = casing_rules[j]; + + if (rule->code == cfrule->code + && rule->context == SCC_ALWAYS + && rule->language == NULL) + { + /* Found it. */ + found_rule = rule; + break; + } + } + + new_rule->code = cfrule->code; + new_rule->language = cfrule->language; + new_rule->context = SCC_ALWAYS; + if (found_rule != NULL) + { + memcpy (new_rule->lower_mapping, found_rule->lower_mapping, + sizeof (new_rule->lower_mapping)); + memcpy (new_rule->title_mapping, found_rule->title_mapping, + sizeof (new_rule->title_mapping)); + memcpy (new_rule->upper_mapping, found_rule->upper_mapping, + sizeof (new_rule->upper_mapping)); + } + else + { + unsigned int k; + + new_rule->lower_mapping[0] = to_lower (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->lower_mapping[k] = 0; + new_rule->title_mapping[0] = to_title (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->title_mapping[k] = 0; + new_rule->upper_mapping[0] = to_upper (cfrule->code); + for (k = 1; k < 3; k++) + new_rule->upper_mapping[k] = 0; + } + memcpy (new_rule->casefold_mapping, cfrule->mapping, + sizeof (new_rule->casefold_mapping)); + + add_casing_rule (new_rule); + } + } + } +} + +static int +compare_casing_rules (const void *a, const void *b) +{ + struct special_casing_rule *a_rule = *(struct special_casing_rule **) a; + struct special_casing_rule *b_rule = *(struct special_casing_rule **) b; + unsigned int a_code = a_rule->code; + unsigned int b_code = b_rule->code; + + if (a_code < b_code) + return -1; + if (a_code > b_code) + return 1; + + /* Sort the more specific rules before the more general ones. */ + return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0)) + + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0))); +} + +static void +sort_casing_rules (void) +{ + /* Sort the rules 1. by code, 2. by specificity. */ + if (num_casing_rules > 1) + qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *), + compare_casing_rules); +} + +/* Output the special casing rules. */ +static void +output_casing_rules (const char *filename, const char *version) +{ + FILE *stream; + unsigned int i, j; + unsigned int minor; + + stream = fopen (filename, "w"); + if (stream == NULL) + { + fprintf (stderr, "cannot open '%s' for writing\n", filename); + exit (1); + } + + fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n"); + fprintf (stream, "/* Special casing rules of Unicode characters. */\n"); + fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s. */\n", + version); + fprintf (stream, "struct special_casing_rule { char code[3]; };\n"); + fprintf (stream, "%%struct-type\n"); + fprintf (stream, "%%language=ANSI-C\n"); + fprintf (stream, "%%define slot-name code\n"); + fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n"); + fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n"); + fprintf (stream, "%%compare-lengths\n"); + fprintf (stream, "%%compare-strncmp\n"); + fprintf (stream, "%%readonly-tables\n"); + fprintf (stream, "%%omit-struct-type\n"); + fprintf (stream, "%%%%\n"); + + minor = 0; + for (i = 0; i < num_casing_rules; i++) + { + struct special_casing_rule *rule = casing_rules[i]; + int context; + + if (i > 0 && rule->code == casing_rules[i - 1]->code) + minor += 1; + else + minor = 0; + + if (!(rule->code < 0x10000)) + { + fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code); + exit (1); + } + + fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ", + (rule->code >> 8) & 0xff, rule->code & 0xff, minor); + + fprintf (stream, "%d, ", + i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0); + + context = rule->context; + if (context < 0) + { + fprintf (stream, "-"); + context = - context; + } + else + fprintf (stream, " "); + switch (context) + { + case SCC_ALWAYS: + fprintf (stream, "SCC_ALWAYS "); + break; + case SCC_FINAL_SIGMA: + fprintf (stream, "SCC_FINAL_SIGMA "); + break; + case SCC_AFTER_SOFT_DOTTED: + fprintf (stream, "SCC_AFTER_SOFT_DOTTED"); + break; + case SCC_MORE_ABOVE: + fprintf (stream, "SCC_MORE_ABOVE "); + break; + case SCC_BEFORE_DOT: + fprintf (stream, "SCC_BEFORE_DOT "); + break; + case SCC_AFTER_I: + fprintf (stream, "SCC_AFTER_I "); + break; + default: + abort (); + } + fprintf (stream, ", "); + + if (rule->language != NULL) + { + if (strlen (rule->language) != 2) + abort (); + fprintf (stream, "{ '%c', '%c' }, ", rule->language[0], rule->language[1]); + } + else + fprintf (stream, "{ '\\0', '\\0' }, "); + + fprintf (stream, "{ "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->upper_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->upper_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->upper_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->lower_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->lower_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->lower_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->title_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->title_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->title_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }, { "); + for (j = 0; j < 3; j++) + { + if (j > 0) + fprintf (stream, ", "); + if (!(rule->casefold_mapping[j] < 0x10000)) + { + fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code); + exit (1); + } + if (rule->casefold_mapping[j] != 0) + fprintf (stream, "0x%04X", rule->casefold_mapping[j]); + else + fprintf (stream, " 0"); + } + fprintf (stream, " }\n"); + } + + if (ferror (stream) || fclose (stream)) + { + fprintf (stderr, "error writing to '%s'\n", filename); + exit (1); + } +} + +/* ========================================================================= */ + +/* Quoting the Unicode standard: + Definition: A character is defined to be "cased" if it has the Lowercase + or Uppercase property or has a General_Category value of + Titlecase_Letter. */ +static bool +is_cased (unsigned int ch) +{ + return (is_property_lowercase (ch) + || is_property_uppercase (ch) + || is_category_Lt (ch)); +} + +/* Quoting the Unicode standard: + Definition: A character is defined to be "case-ignorable" if it has the + value MidLetter {or the value MidNumLet} for the Word_Break property or + its General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), + Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk). + The text marked in braces was added in Unicode 5.1.0, see + <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of + Definition of case-ignorable". */ +/* Since this predicate is only used for the "Before C" and "After C" + conditions of FINAL_SIGMA, we exclude the "cased" characters here. + This simplifies the evaluation of the regular expressions + \p{cased} (\p{case-ignorable})* C + and + C (\p{case-ignorable})* \p{cased} + */ +static bool +is_case_ignorable (unsigned int ch) +{ + return (unicode_org_wbp[ch] == WBP_MIDLETTER + || unicode_org_wbp[ch] == WBP_MIDNUMLET + || is_category_Mn (ch) + || is_category_Me (ch) + || is_category_Cf (ch) + || is_category_Lm (ch) + || is_category_Sk (ch)) + && !is_cased (ch); +} + +/* ------------------------------------------------------------------------- */ + +/* Output all case related properties. */ +static void +output_casing_properties (const char *version) +{ +#define PROPERTY(FN,P) \ + debug_output_predicate ("unicase/" #FN ".txt", is_ ## P); \ + output_predicate_test ("../tests/unicase/test-" #FN ".c", is_ ## P, "uc_is_" #P " (c)"); \ + output_predicate ("unicase/" #FN ".h", is_ ## P, "u_casing_property_" #P, "Casing Properties", version); + PROPERTY(cased, cased) + PROPERTY(ignorable, case_ignorable) +#undef PROPERTY +} + +/* ========================================================================= */ + +int +main (int argc, char * argv[]) +{ + const char *unicodedata_filename; + const char *proplist_filename; + const char *derivedproplist_filename; + const char *arabicshaping_filename; + const char *scripts_filename; + const char *blocks_filename; + const char *proplist30_filename; + const char *eastasianwidth_filename; + const char *linebreak_filename; + const char *wordbreakproperty_filename; + const char *graphemebreakproperty_filename; + const char *compositionexclusions_filename; + const char *specialcasing_filename; + const char *casefolding_filename; + const char *version; + + if (argc != 16) + { + fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt ArabicShaping.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt GraphemeBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n", + argv[0]); + exit (1); + } + + unicodedata_filename = argv[1]; + proplist_filename = argv[2]; + derivedproplist_filename = argv[3]; + arabicshaping_filename = argv[4]; + scripts_filename = argv[5]; + blocks_filename = argv[6]; + proplist30_filename = argv[7]; + eastasianwidth_filename = argv[8]; + linebreak_filename = argv[9]; + wordbreakproperty_filename = argv[10]; + graphemebreakproperty_filename = argv[11]; + compositionexclusions_filename = argv[12]; + specialcasing_filename = argv[13]; + casefolding_filename = argv[14]; + version = argv[15]; + + fill_attributes (unicodedata_filename); + clear_properties (); + fill_properties (proplist_filename); + fill_properties (derivedproplist_filename); + fill_properties30 (proplist30_filename); + fill_arabicshaping (arabicshaping_filename); + fill_scripts (scripts_filename); + fill_blocks (blocks_filename); + fill_width (eastasianwidth_filename); + fill_org_lbp (linebreak_filename); + fill_org_wbp (wordbreakproperty_filename); + fill_org_gbp (graphemebreakproperty_filename); + fill_composition_exclusions (compositionexclusions_filename); + fill_casing_rules (specialcasing_filename); + fill_casefolding_rules (casefolding_filename); + redistribute_casefolding_rules (); + sort_casing_rules (); + + output_categories (version); + output_category ("unictype/categ_of.h", version); + output_combclass ("unictype/combiningclass.h", version); + output_bidi_category ("unictype/bidi_of.h", version); + output_decimal_digit_test ("../tests/unictype/test-decdigit.h", version); + output_decimal_digit ("unictype/decdigit.h", version); + output_digit_test ("../tests/unictype/test-digit.h", version); + output_digit ("unictype/digit.h", version); + output_numeric_test ("../tests/unictype/test-numeric.h", version); + output_numeric ("unictype/numeric.h", version); + output_mirror ("unictype/mirror.h", version); + output_properties (version); + output_joining_type_test ("../tests/unictype/test-joiningtype_of.h", version); + output_joining_type ("unictype/joiningtype_of.h", version); + output_joining_group_test ("../tests/unictype/test-joininggroup_of.h", version); + output_joining_group ("unictype/joininggroup_of.h", version); + + output_scripts (version); + output_scripts_byname (version); + output_blocks (version); + output_ident_properties (version); + output_nonspacing_property ("uniwidth/width.c.part"); + output_width_property_test ("../tests/uniwidth/test-uc_width2.sh.part"); + output_old_ctype (version); + + debug_output_lbrk_tables ("unilbrk/lbrkprop.txt"); + debug_output_org_lbrk_tables ("unilbrk/lbrkprop_org.txt"); + output_lbrk_tables ("unilbrk/lbrkprop1.h", "unilbrk/lbrkprop2.h", version); + + debug_output_wbrk_tables ("uniwbrk/wbrkprop.txt"); + debug_output_org_wbrk_tables ("uniwbrk/wbrkprop_org.txt"); + output_wbrk_tables ("uniwbrk/wbrkprop.h", version); + + output_gbp_test ("../tests/unigbrk/test-uc-gbrk-prop.h"); + output_gbp_table ("unigbrk/gbrkprop.h", version); + + output_decomposition_tables ("uninorm/decomposition-table1.h", "uninorm/decomposition-table2.h", version); + debug_output_composition_tables ("uninorm/composition.txt"); + output_composition_tables ("uninorm/composition-table.gperf", version); + + output_simple_mapping_test ("../tests/unicase/test-uc_toupper.c", "uc_toupper", to_upper, version); + output_simple_mapping_test ("../tests/unicase/test-uc_tolower.c", "uc_tolower", to_lower, version); + output_simple_mapping_test ("../tests/unicase/test-uc_totitle.c", "uc_totitle", to_title, version); + output_simple_mapping ("unicase/toupper.h", to_upper, version); + output_simple_mapping ("unicase/tolower.h", to_lower, version); + output_simple_mapping ("unicase/totitle.h", to_title, version); + output_simple_mapping ("unicase/tocasefold.h", to_casefold, version); + output_casing_rules ("unicase/special-casing-table.gperf", version); + output_casing_properties (version); + + return 0; +} + +/* + * For Emacs M-x compile + * Local Variables: + * compile-command: " + gcc -O -Wall gen-uni-tables.c -Iunictype -o gen-uni-tables && \ + ./gen-uni-tables \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/UnicodeData.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/PropList.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/DerivedCoreProperties.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/ArabicShaping.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Scripts.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/Blocks.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/3.0.1/PropList-3.0.1.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/EastAsianWidth.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/LineBreak.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/WordBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/auxiliary/GraphemeBreakProperty.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CompositionExclusions.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/SpecialCasing.txt \ + /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/6.0.0/ucd/CaseFolding.txt \ + 6.0.0 \ + && diff unilbrk/lbrkprop_org.txt unilbrk/lbrkprop.txt \ + && diff uniwbrk/wbrkprop_org.txt uniwbrk/wbrkprop.txt + " + * End: + */ |