Diffstat (limited to 'Tools/unicode/makeunicodedata.py')
-rw-r--r--  Tools/unicode/makeunicodedata.py  135
1 file changed, 77 insertions(+), 58 deletions(-)
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index b2615eefcd..0783f1735d 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -25,17 +25,17 @@
# written by Fredrik Lundh (fredrik@pythonware.com)
#
-import sys
+import sys, os, zipfile
SCRIPT = sys.argv[0]
VERSION = "3.2"
# The Unicode Database
-UNIDATA_VERSION = "5.2.0"
+UNIDATA_VERSION = "6.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.txt"
+UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
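
The %s slot in each template takes either an empty string (the name of the copy on unicode.org) or "-" plus a version (the locally cached copy), so one constant names both forms. A minimal sketch of the expansion, using the UNIHAN constant above:

    UNIHAN % ("",)        # 'Unihan.zip'       -- name under .../Public/6.0.0/ucd/
    UNIHAN % ("-6.0.0",)  # 'Unihan-6.0.0.zip' -- local cached filename
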
@@ -75,23 +75,13 @@ def maketables(trace=0):
print("--- Reading", UNICODE_DATA % "", "...")
version = ""
- unicode = UnicodeData(UNICODE_DATA % version,
- COMPOSITION_EXCLUSIONS % version,
- EASTASIAN_WIDTH % version,
- UNIHAN % version,
- DERIVED_CORE_PROPERTIES % version,
- DERIVEDNORMALIZATION_PROPS % version,
- LINE_BREAK % version)
+ unicode = UnicodeData(UNIDATA_VERSION)
print(len(list(filter(None, unicode.table))), "characters")
for version in old_versions:
print("--- Reading", UNICODE_DATA % ("-"+version), "...")
- old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
- COMPOSITION_EXCLUSIONS % ("-"+version),
- EASTASIAN_WIDTH % ("-"+version),
- UNIHAN % ("-"+version),
- DERIVED_CORE_PROPERTIES % ("-"+version))
+ old_unicode = UnicodeData(version)
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)
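
With this change the constructor derives every filename from the version string itself, so both call sites shrink to a single argument. A minimal usage sketch:

    unicode = UnicodeData("6.0.0")       # current UCD, downloaded on demand
    old_unicode = UnicodeData("3.2.0")   # archived version, used for deltas
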
@@ -771,6 +761,10 @@ def merge_old_version(version, new, old):
elif k == 16:
# derived property changes; not yet
pass
+ elif k == 17:
+ # normalization quickchecks are not performed
+ # for older versions
+ pass
else:
class Difference(Exception):pass
raise Difference(hex(i), k, old.table[i], new.table[i])
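
The index k refers to the per-character record documented in the UnicodeData class below: fields 0-14 come from UnicodeData.txt, 15 is the east-asian width, 16 the derived properties, and 17 the quickcheck bits appended by this change. A sketch of that layout:

    # record field indices, as compared by merge_old_version
    #   0..14 -> UnicodeData.txt fields (name, category, decomposition, ...)
    #   15    -> east-asian width
    #   16    -> derived core properties (changes not yet merged)
    #   17    -> packed normalization quickchecks (skipped for old versions)
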
@@ -779,6 +773,21 @@ def merge_old_version(version, new, old):
numeric_changes)),
normalization_changes))
+def open_data(template, version):
+ local = template % ('-'+version,)
+ if not os.path.exists(local):
+ import urllib.request
+ if version == '3.2.0':
+ # irregular url structure
+ url = 'http://www.unicode.org/Public/3.2-Update/' + local
+ else:
+ url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+ urllib.request.urlretrieve(url, filename=local)
+ if local.endswith('.txt'):
+ return open(local, encoding='utf-8')
+ else:
+ # Unihan.zip
+ return open(local, 'rb')
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
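
open_data caches each UCD file next to the script, downloading it on first use; .txt files come back as UTF-8 text streams and Unihan.zip as a binary stream suitable for zipfile. A minimal sketch of the intended use, with the template being one of the constants from the top of the file:

    f = open_data(EASTASIAN_WIDTH, "6.0.0")  # fetches EastAsianWidth-6.0.0.txt once;
    line = f.readline()                      # later runs read the cached local copy
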
@@ -793,11 +802,11 @@ class UnicodeData:
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# derived-props] (17)
- def __init__(self, filename, exclusions, eastasianwidth, unihan,
- derivedprops, derivednormalizationprops=None, linebreakprops=None,
+ def __init__(self, version,
+ linebreakprops=False,
expand=1):
self.changed = []
- file = open(filename)
+ file = open_data(UNICODE_DATA, version)
table = [None] * 0x110000
while 1:
s = file.readline()
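
Each UnicodeData.txt record is one semicolon-separated line, keyed by a hex code point in the first field; the loop above reads them into a table with one slot per code point. A sketch of the per-line split, assuming the standard field layout:

    s = "0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041"
    f = s.split(";")
    char = int(f[0], 16)   # 0x61; f[1] is the name, f[2] the general category
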
@@ -825,11 +834,11 @@ class UnicodeData:
table[i] = f2
# public attributes
- self.filename = filename
+ self.filename = UNICODE_DATA % ''
self.table = table
self.chars = list(range(0x110000)) # unicode 3.2
- file = open(exclusions)
+ file = open_data(COMPOSITION_EXCLUSIONS, version)
self.exclusions = {}
for s in file:
s = s.strip()
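
CompositionExclusions.txt lists one code point per line, with '#' comments; the loop records each as excluded from canonical composition. A sketch of one entry:

    line = "0958    # DEVANAGARI LETTER QA"
    s = line.strip()
    if s and not s.startswith("#"):
        char = int(s.split()[0], 16)   # 0x958 -> self.exclusions[char] = 1
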
@@ -841,7 +850,7 @@ class UnicodeData:
self.exclusions[char] = 1
widths = [None] * 0x110000
- for s in open(eastasianwidth):
+ for s in open_data(EASTASIAN_WIDTH, version):
s = s.strip()
if not s:
continue
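
EastAsianWidth.txt maps single code points or ranges to width classes, again with '#' comments. A sketch of both entry forms, assuming the usual 'code;width' layout:

    for line in ("00A1;A", "3400..4DB5;W"):
        code, width = line.split("#", 1)[0].strip().split(";")
        if ".." in code:
            first, last = [int(c, 16) for c in code.split("..")]
        else:
            first = last = int(code, 16)
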
@@ -862,7 +871,7 @@ class UnicodeData:
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(set())
- for s in open(derivedprops):
+ for s in open_data(DERIVED_CORE_PROPERTIES, version):
s = s.split('#', 1)[0].strip()
if not s:
continue
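
DerivedCoreProperties.txt assigns named properties to single code points or ranges; each property name is added to the set stored in the record's last field. A sketch of one range entry:

    line = "0061..007A ; Lowercase # Ll  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z"
    r, p = [x.strip() for x in line.split("#", 1)[0].split(";")]
    first, last = [int(c, 16) for c in r.split("..")]   # 'Lowercase' applies to a..z
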
@@ -881,43 +890,53 @@ class UnicodeData:
# apply to unassigned code points; ignore them
table[char][-1].add(p)
- if linebreakprops:
- for s in open(linebreakprops):
- s = s.partition('#')[0]
- s = [i.strip() for i in s.split(';')]
- if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
- continue
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
- table[char][-1].add('Line_Break')
-
- if derivednormalizationprops:
- quickchecks = [0] * 0x110000 # default is Yes
- qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
- for s in open(derivednormalizationprops):
- if '#' in s:
- s = s[:s.index('#')]
- s = [i.strip() for i in s.split(';')]
- if len(s) < 2 or s[1] not in qc_order:
- continue
- quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
- quickcheck_shift = qc_order.index(s[1])*2
- quickcheck <<= quickcheck_shift
- if '..' not in s[0]:
- first = last = int(s[0], 16)
- else:
- first, last = [int(c, 16) for c in s[0].split('..')]
- for char in range(first, last+1):
- assert not (quickchecks[char]>>quickcheck_shift)&3
- quickchecks[char] |= quickcheck
- for i in range(0, 0x110000):
- if table[i] is not None:
- table[i].append(quickchecks[i])
+ for s in open_data(LINE_BREAK, version):
+ s = s.partition('#')[0]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+ continue
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ table[char][-1].add('Line_Break')
+
+ # We only want the quickcheck properties
+ # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+ # Yes is the default, hence only N and M occur
+ # In 3.2.0, the format was different (NF?_NO)
+ # The parsing will incorrectly determine these as
+ # "yes", however, unicodedata.c will not perform quickchecks
+ # for older versions, and no delta records will be created.
+ quickchecks = [0] * 0x110000
+ qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+ for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
+ if '#' in s:
+ s = s[:s.index('#')]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in qc_order:
+ continue
+ quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+ quickcheck_shift = qc_order.index(s[1])*2
+ quickcheck <<= quickcheck_shift
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ assert not (quickchecks[char]>>quickcheck_shift)&3
+ quickchecks[char] |= quickcheck
+ for i in range(0, 0x110000):
+ if table[i] is not None:
+ table[i].append(quickchecks[i])
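
The packed field holds two bits per normalization form, in qc_order order, with 0 meaning the default Yes, 1 Maybe, and 2 No. A minimal sketch of decoding it back (quickcheck is a hypothetical helper, not part of this patch):

    def quickcheck(packed, prop):
        shift = "NFD_QC NFKD_QC NFC_QC NFKC_QC".split().index(prop) * 2
        return ("Yes", "Maybe", "No")[(packed >> shift) & 3]
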
- for line in open(unihan, encoding='utf-8'):
+ zip = zipfile.ZipFile(open_data(UNIHAN, version))
+ if version == '3.2.0':
+ data = zip.open('Unihan-3.2.0.txt').read()
+ else:
+ data = zip.open('Unihan_NumericValues.txt').read()
+ for line in data.decode("utf-8").splitlines():
if not line.startswith('U+'):
continue
code, tag, value = line.split(None, 3)[:3]
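
Unihan records are tab-separated 'U+xxxx<TAB>tag<TAB>value' lines; reading only the Unihan_NumericValues.txt member of the zip limits the work to the numeric tags the script actually uses. A sketch of one record, matching the split on the last line above:

    line = "U+4E00\tkPrimaryNumeric\t1"
    code, tag, value = line.split(None, 3)[:3]
    char = int(code[2:], 16)   # strip the 'U+' prefix -> 0x4E00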